import pandas as pd
import numpy as np
# Load the CIC collection from a single parquet file into a DataFrame and
# preview its first rows.
cic_df=pd.read_parquet("..//cic/cic-collection.parquet")
cic_df.head()
| Flow Duration | Total Fwd Packets | Total Backward Packets | Fwd Packets Length Total | Bwd Packets Length Total | Fwd Packet Length Max | Fwd Packet Length Mean | Fwd Packet Length Std | Bwd Packet Length Max | Bwd Packet Length Mean | ... | Active Mean | Active Std | Active Max | Active Min | Idle Mean | Idle Std | Idle Max | Idle Min | Label | ClassLabel | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4 | 2 | 0 | 12.0 | 0.0 | 6.0 | 6.00000 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | Benign |
| 1 | 1 | 2 | 0 | 12.0 | 0.0 | 6.0 | 6.00000 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | Benign |
| 2 | 3 | 2 | 0 | 12.0 | 0.0 | 6.0 | 6.00000 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | Benign |
| 3 | 1 | 2 | 0 | 12.0 | 0.0 | 6.0 | 6.00000 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | Benign |
| 4 | 609 | 7 | 4 | 484.0 | 414.0 | 233.0 | 69.14286 | 111.967896 | 207.0 | 103.5 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | Benign |
5 rows × 59 columns
# Print each column's dtype together with the frame's overall memory usage.
cic_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9167581 entries, 0 to 9167580 Data columns (total 59 columns): # Column Dtype --- ------ ----- 0 Flow Duration int64 1 Total Fwd Packets int32 2 Total Backward Packets int32 3 Fwd Packets Length Total float64 4 Bwd Packets Length Total float64 5 Fwd Packet Length Max float64 6 Fwd Packet Length Mean float32 7 Fwd Packet Length Std float32 8 Bwd Packet Length Max float64 9 Bwd Packet Length Mean float32 10 Bwd Packet Length Std float32 11 Flow Bytes/s float64 12 Flow Packets/s float64 13 Flow IAT Mean float32 14 Flow IAT Std float32 15 Flow IAT Max float64 16 Flow IAT Min float64 17 Fwd IAT Total float64 18 Fwd IAT Mean float32 19 Fwd IAT Std float32 20 Fwd IAT Max float64 21 Fwd IAT Min float64 22 Bwd IAT Total float64 23 Bwd IAT Mean float32 24 Bwd IAT Std float32 25 Bwd IAT Max float64 26 Bwd IAT Min float64 27 Fwd PSH Flags int8 28 Fwd Header Length int64 29 Bwd Header Length int64 30 Fwd Packets/s float32 31 Bwd Packets/s float32 32 Packet Length Max float64 33 Packet Length Mean float32 34 Packet Length Std float32 35 Packet Length Variance float32 36 SYN Flag Count int8 37 URG Flag Count int8 38 Avg Packet Size float32 39 Avg Fwd Segment Size float32 40 Avg Bwd Segment Size float32 41 Subflow Fwd Packets int32 42 Subflow Fwd Bytes int32 43 Subflow Bwd Packets int32 44 Subflow Bwd Bytes int32 45 Init Fwd Win Bytes int32 46 Init Bwd Win Bytes int32 47 Fwd Act Data Packets int32 48 Fwd Seg Size Min int32 49 Active Mean float32 50 Active Std float32 51 Active Max float64 52 Active Min float64 53 Idle Mean float32 54 Idle Std float32 55 Idle Max float64 56 Idle Min float64 57 Label object 58 ClassLabel object dtypes: float32(22), float64(19), int32(10), int64(3), int8(3), object(2) memory usage: 2.8+ GB
# (rows, columns) of the raw data, before any cleaning.
cic_df.shape
(9167581, 59)
# Count the missing values in every column.
cic_df.isna().sum()
Flow Duration 0 Total Fwd Packets 0 Total Backward Packets 0 Fwd Packets Length Total 0 Bwd Packets Length Total 0 Fwd Packet Length Max 0 Fwd Packet Length Mean 0 Fwd Packet Length Std 0 Bwd Packet Length Max 0 Bwd Packet Length Mean 0 Bwd Packet Length Std 0 Flow Bytes/s 0 Flow Packets/s 0 Flow IAT Mean 0 Flow IAT Std 0 Flow IAT Max 0 Flow IAT Min 0 Fwd IAT Total 0 Fwd IAT Mean 0 Fwd IAT Std 0 Fwd IAT Max 0 Fwd IAT Min 0 Bwd IAT Total 0 Bwd IAT Mean 0 Bwd IAT Std 0 Bwd IAT Max 0 Bwd IAT Min 0 Fwd PSH Flags 0 Fwd Header Length 0 Bwd Header Length 0 Fwd Packets/s 0 Bwd Packets/s 0 Packet Length Max 0 Packet Length Mean 0 Packet Length Std 0 Packet Length Variance 0 SYN Flag Count 0 URG Flag Count 0 Avg Packet Size 0 Avg Fwd Segment Size 0 Avg Bwd Segment Size 0 Subflow Fwd Packets 0 Subflow Fwd Bytes 0 Subflow Bwd Packets 0 Subflow Bwd Bytes 0 Init Fwd Win Bytes 0 Init Bwd Win Bytes 0 Fwd Act Data Packets 0 Fwd Seg Size Min 0 Active Mean 0 Active Std 0 Active Max 0 Active Min 0 Idle Mean 0 Idle Std 0 Idle Max 0 Idle Min 0 Label 0 ClassLabel 0 dtype: int64
# Show the rows that exactly repeat an earlier row (with the default
# keep='first', the first occurrence itself is not flagged).
cic_df[cic_df.duplicated()]
| Flow Duration | Total Fwd Packets | Total Backward Packets | Fwd Packets Length Total | Bwd Packets Length Total | Fwd Packet Length Max | Fwd Packet Length Mean | Fwd Packet Length Std | Bwd Packet Length Max | Bwd Packet Length Mean | ... | Active Mean | Active Std | Active Max | Active Min | Idle Mean | Idle Std | Idle Max | Idle Min | Label | ClassLabel | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 321783 | 1 | 2 | 0 | 12.0 | 0.0 | 6.0 | 6.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | Benign |
| 346078 | 2 | 3 | 0 | 18.0 | 0.0 | 6.0 | 6.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | Benign |
| 361731 | 2 | 1 | 1 | 6.0 | 6.0 | 6.0 | 6.0 | 0.0 | 6.0 | 6.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | Benign |
| 366568 | 4 | 2 | 0 | 12.0 | 0.0 | 6.0 | 6.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | Benign |
| 369416 | 3 | 3 | 0 | 18.0 | 0.0 | 6.0 | 6.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | Benign |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9030704 | 1 | 3 | 0 | 18.0 | 0.0 | 6.0 | 6.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | Benign |
| 9036659 | 158 | 1 | 1 | 6.0 | 6.0 | 6.0 | 6.0 | 0.0 | 6.0 | 6.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | Benign |
| 9111903 | 644 | 1 | 1 | 6.0 | 6.0 | 6.0 | 6.0 | 0.0 | 6.0 | 6.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | Benign |
| 9134426 | 428 | 1 | 1 | 6.0 | 6.0 | 6.0 | 6.0 | 0.0 | 6.0 | 6.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | Benign |
| 9163553 | 11 | 2 | 1 | 12.0 | 6.0 | 6.0 | 6.0 | 0.0 | 6.0 | 6.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | Benign |
310 rows × 59 columns
# Number of rows per value of the 'Label' column, most frequent first.
cic_df['Label'].value_counts()
Label Benign 7186189 DDoS-LOIC-HTTP 575364 DoS-Hulk 318740 DDoS-HOIC 198861 Botnet 145968 DDoS 128062 DDoS-NTP 121328 DDoS-TFTP 98833 Bruteforce-SSH 97260 Infiltration 94857 DoS-Goldeneye 52324 DDoS-Syn 47757 DDoS-UDP 28863 DoS-Slowloris 15243 DDoS-MSSQL 11784 DDoS-UDPLag 8452 Bruteforce-FTP 5984 DoS-Slowhttptest 5271 DDoS-Ddossim 5115 DDoS-DNS 3668 DoS-Slowread 2786 Portscan 2255 DDoS-LDAP 2092 Webattack-bruteforce 2020 DDoS-SNMP 2017 DDoS-Slowloris 1858 DoS-Slowheaders 1649 Webattack-XSS 876 DoS-Rudy 699 DDoS-NetBIOS 675 DoS-Slowbody 621 Webattack-SQLi 99 DoS-Heartbleed 11 Name: count, dtype: int64
# Number of rows per value of the 'ClassLabel' column, most frequent first.
cic_df['ClassLabel'].value_counts()
ClassLabel Benign 7186189 DDoS 1234729 DoS 397344 Botnet 145968 Bruteforce 103244 Infiltration 94857 Webattack 2995 Portscan 2255 Name: count, dtype: int64
# Remove exact duplicate rows in place, keeping the first occurrence of each.
# The index is not reset (ignore_index defaults to False), so the remaining
# rows keep their original index labels.
cic_df.drop_duplicates(inplace=True)
# (rows, columns) after de-duplication, for comparison with the raw shape.
cic_df.shape
(9167271, 59)
- Number of rows before removal of duplicate records: 9167581
- Number of rows after removal of duplicate records: 9167271 (310 duplicate rows removed)
# Recount the rows per 'Label' value to see which labels lost rows to
# de-duplication.
cic_df['Label'].value_counts()
Label Benign 7185881 DDoS-LOIC-HTTP 575364 DoS-Hulk 318740 DDoS-HOIC 198861 Botnet 145968 DDoS 128062 DDoS-NTP 121326 DDoS-TFTP 98833 Bruteforce-SSH 97260 Infiltration 94857 DoS-Goldeneye 52324 DDoS-Syn 47757 DDoS-UDP 28863 DoS-Slowloris 15243 DDoS-MSSQL 11784 DDoS-UDPLag 8452 Bruteforce-FTP 5984 DoS-Slowhttptest 5271 DDoS-Ddossim 5115 DDoS-DNS 3668 DoS-Slowread 2786 Portscan 2255 DDoS-LDAP 2092 Webattack-bruteforce 2020 DDoS-SNMP 2017 DDoS-Slowloris 1858 DoS-Slowheaders 1649 Webattack-XSS 876 DoS-Rudy 699 DDoS-NetBIOS 675 DoS-Slowbody 621 Webattack-SQLi 99 DoS-Heartbleed 11 Name: count, dtype: int64
- Number of rows with Label=Benign prior removal of duplicates: 7186189.
- Number of rows with Label=Benign after removal of duplicates: 7185881.
- Number of rows with Label=DDoS-NTP prior removal of duplicates: 121328.
- Number of rows with Label=DDoS-NTP after removal of duplicates: 121326.
Thus: -
- 0.0043% of the records with Label=Benign were removed as duplicates.
- 0.0016% of the records with Label=DDoS-NTP were removed as duplicates.
We observed that only a very small proportion of records was removed as duplicates, for both the dominant class and a minority class in the dataset; overall, the distribution of rows with respect to the field Label has remained the same.
# Recount the rows per 'ClassLabel' value to see which classes lost rows to
# de-duplication.
cic_df['ClassLabel'].value_counts()
ClassLabel Benign 7185881 DDoS 1234727 DoS 397344 Botnet 145968 Bruteforce 103244 Infiltration 94857 Webattack 2995 Portscan 2255 Name: count, dtype: int64
- Number of rows with ClassLabel=Benign prior removal of duplicates: 7186189.
- Number of rows with ClassLabel=Benign after removal of duplicates: 7185881.
- Number of rows with ClassLabel=DDoS prior to removal of duplicates: 1234729.
- Number of rows with ClassLabel=DDoS after removal of duplicates: 1234727.
Thus: -
- 0.0043% of the records with ClassLabel=Benign were removed as duplicates.
- 0.00016% of the records with ClassLabel=DDoS were removed as duplicates.
We observed that only a very small proportion of records was removed as duplicates, for both the dominant class and a minority class in the dataset; overall, the distribution of rows with respect to the field ClassLabel has remained the same.
import matplotlib.pyplot as plt
# Histogram (50 bins) of every numeric feature, drawn on one large grid.
axes = cic_df.hist(bins=50, figsize=(50, 50))
# DataFrame.hist draws numeric columns only, so pair the axes with exactly
# those columns rather than with all columns (which also contain the string
# columns). zip stops at the shorter sequence, so spare grid cells are left
# untouched.
numeric_columns = cic_df.select_dtypes(include="number").columns
for ax, col in zip(axes.flatten(), numeric_columns):
    ax.set_xlabel("Values")
    ax.set_ylabel("Frequency")
    ax.set_title(col)
plt.show()
# Same grid of 50-bin histograms, with a logarithmic frequency axis so that
# sparsely populated bins remain visible.
axes = cic_df.hist(bins=50, figsize=(50, 50), log=True)
# DataFrame.hist draws numeric columns only, so pair the axes with exactly
# those columns rather than with all columns (which also contain the string
# columns). zip stops at the shorter sequence, so spare grid cells are left
# untouched.
numeric_columns = cic_df.select_dtypes(include="number").columns
for ax, col in zip(axes.flatten(), numeric_columns):
    ax.set_xlabel("Values")
    ax.set_ylabel("Frequency w.r.t log scale")
    ax.set_title(col)
plt.show()
# Feature columns only: every column except the two target columns.
target_columns = ('Label', 'ClassLabel')
columns = [name for name in cic_df.columns if name not in target_columns]
# One subplot row per feature, stacked vertically (5 inches of height each).
n_features = len(columns)
fig, axes = plt.subplots(nrows=n_features, ncols=1, figsize=(10, n_features * 5))
# Draw each feature's 50-bin histogram on its own axes.
for ax, column in zip(axes, columns):
    cic_df[column].hist(bins=50, ax=ax)
    ax.set(xlabel="Values", ylabel="Frequency", title=column)
plt.tight_layout()
plt.show()
- In the above graphs, the frequency scale is too wide to show all values, so the distribution of each feature cannot be observed clearly.
- Thus, below we plot the graphs with a logarithmic frequency scale. Bar heights are then no longer proportional to the actual counts, but the distribution pattern is preserved and low-frequency bins become visible, which gives us more understanding of each feature.
# One subplot row per feature, stacked vertically (5 inches of height each).
n_features = len(columns)
fig, axes = plt.subplots(nrows=n_features, ncols=1, figsize=(10, n_features * 5))
# Draw each feature's 50-bin histogram on its own axes, with a logarithmic
# frequency axis.
for ax, column in zip(axes, columns):
    cic_df[column].hist(bins=50, ax=ax, log=True)
    ax.set(xlabel="Values", ylabel="Frequency w.r.t log scale", title=column)
plt.tight_layout()
plt.show()
Observations and interpretations from above Histograms with Logarithmic scale: -
Flow Duration: The duration of the flow
- We observed negative values on X-axis, thus, we need to check the actual values under the column to determine if data is accurate or invalid.
- Peak was observed at extreme right, Flow Duration=0.
- There are some scattered bins of count=1
Total Fwd Packets: Total number of forward packets
- The distribution is skewed towards right: Positively skewed.
- Peak was observed on first bin from the left, after which we saw sharp decline.
- There is another small peak around Total Fwd Packets=125000, but it is in plateau shape. Thus, we see many values around 125000.
- The first bin (Peak) is in the range around 0 to 6250.
- After the second bin , there is consistent decline.
- Since there are two peaks at significant distance apart, we can also call the graph bi-modal.
- We observed value for Total Fwd Packets>300000. This may indicate outlier in the data.
Total Backward Packets: Total number of backward packets
- The distribution is skewed towards right: Positively skewed.
- Peak was observed on first bin from left, Total Backward Packets: 0 to 6250.
- After the peak, there is significant decline in results.
- Some records were observed at regular intervals but with very low frequency.
Fwd Packets Length Total: Total length of forward packets
- Peak was observed on first bin from left.
- Most values are stacked on the left side of X-axis and they continuously decline as we move towards right hand side of X-axis.
- There are a couple of observations at a distance on right hand side after long gap. They may indicate outliers in the data.
Bwd Packets Length Total: Total length of backward packets
- The distribution is skewed towards right: Positively skewed.
- Peak was observed on first bin from left.
- After the peak, there is significant decline in results.
- There are some observations spread out on X-axis, but all have frequency less than 10.
Fwd Packet Length Max: Maximum length of forward packets
- The distribution is skewed towards right: Positively skewed.
- Peak was observed on first bin from left.
- The first bin (Peak) is in the range around 0 to 1250.
- Most number of observations lie between Fwd Packet Length Max>0 and Fwd Packet Length Max<10000.
- A small peak was observed around Fwd Packet Length Max>20000 and Fwd Packet Length Max<30000. However, the frequency is very low compared to the peak observed in the first bin.
- There are some observations around Fwd Packet Length Max=60000. This may indicate outliers in the data.
Fwd Packet Length Mean: Mean length of forward packets
- The distribution is skewed towards right: Positively skewed.
- Peak was observed on first bin from left.
- Most number of observations lie between Fwd Packet Length Mean>=0 and Fwd Packet Length Mean<=2500.
- There are some small number of observations around Fwd Packet Length Mean=15000 and above. This may indicate outliers in the data.
Fwd Packet Length Std: Standard deviation length of forward packets
- The distribution is skewed towards right: Positively skewed.
- Peak was observed on first bin from left.
- Most number of observations lie between Fwd Packet Length Std>=0 and Fwd Packet Length Std<=5000.
- There are some very small number of observations at Fwd Packet Length Std>7500. This may indicate outliers in the data.
Bwd Packet Length Max: Maximum length of backward packets
- The distribution is skewed towards right: Positively skewed.
- Peak was observed on first bin from left. Peak lies around Bwd Packet Length Max>=0 and Bwd Packet Length Max<=1250.
- Most number of observations lie between Bwd Packet Length Max>=0 and Bwd Packet Length Max<=10000.
- There are few observations in the range: - Bwd Packet Length Max>=11250 and Bwd Packet Length Max<=20000, Bwd Packet Length Max>=30000 and Bwd Packet Length Max<=35000.
- There is an observation at Bwd Packet Length Max>60000. This may indicate outliers in the data.
Bwd Packet Length Mean: Mean length of backward packets
- The distribution is skewed towards right: Positively skewed.
- Peak was observed on first bin from left. Peak lies around Bwd Packet Length Mean>=0 and Bwd Packet Length Mean<=666.67.
- After the peak, there is significant decline in results.
- Between Bwd Packet Length Mean=0 and Bwd Packet Length Mean=5000, we observed J-shaped graph.
- There is an observation at Bwd Packet Length Mean=35000. This may indicate outliers in the data.
Bwd Packet Length Std: Standard deviation length of backward packets
- The distribution is skewed towards right: Positively skewed.
- Peak was observed on first bin from left. Peak lies around Bwd Packet Length Std>=0 and Bwd Packet Length Std<=416.67.
- After the peak, there is significant decline in results.
- There is plateau region observed around Bwd Packet Length Std>=2083 and Bwd Packet Length Std<=2500.
- There is another plateau region observed (smaller than the above) around Bwd Packet Length Std>=3750 and Bwd Packet Length Std<=4166.
- There is an observation at Bwd Packet Length Std>20000. This may indicate outliers in the data.
Flow Bytes/s: Flow bytes per second
- The distribution is skewed towards right: Positively skewed.
- Peak was observed around Flow Bytes/s=0.
- After the peak, there is consistent decline in results.
- Towards right hand side of the graph, there is increase in number of observations compared to other bins prior to it excluding the peak.
- Between the two extremes of the graph there were some plateau regions.
- We observed negative values on X-axis, thus, we need to check the actual values under the column to determine if data is accurate or invalid.
Flow Packets/s: Flow packets per second
- The distribution is skewed towards right: Positively skewed.
- Peak was observed around Flow Packets/s=0
- After the peak, there is consistent decline in results.
- At Flow Packets/s=2 and Flow Packets/s=3, there are relatively small peaks.
- We observed negative values on X-axis, thus, we need to check the actual values under the column to determine if data is accurate or invalid.
Flow IAT Mean: Mean time between flows
- Peak was observed at Flow IAT Mean=0.
- Most values are concentrated in the bin represented by the peak.
- We observed negative values on X-axis, thus, we need to check the actual values under the column to determine if data is accurate or invalid.
Flow IAT Std: Standard deviation of time between flows
- The distribution is skewed towards right: Positively skewed.
- Peak was observed around Flow IAT Std=0.
- Most values are concentrated in the bin represented by the peak.
- There are a few observations in the range: - Flow IAT Std>=2 and Flow IAT Std<=3, Flow IAT Std>=3 and Flow IAT Std<=4 and Flow IAT Std>4.
Flow IAT Max: Maximum time between flows
- Peak was observed around Flow IAT Max=0.
- We observed negative values on X-axis, thus, we need to check the actual values under the column to determine if data is accurate or invalid.
- On X-axis values lie in the range -1.0 to +1.0
Flow IAT Min: Minimum time between flows
- Peak was observed around Flow IAT Min=0.
- We observed negative values on X-axis, thus, we need to check the actual values under the column to determine if data is accurate or invalid.
Fwd IAT Total: Total time between forward packets
- Peak was observed around Fwd IAT Total=0.
- We observed negative values on X-axis, thus, we need to check the actual values under the column to determine if data is accurate or invalid.
Fwd IAT Mean: Mean time between forward packets
- Peak was observed around Fwd IAT Mean=0.
- We observed negative values on X-axis, thus, we need to check the actual values under the column to determine if data is accurate or invalid.
Fwd IAT Std: Standard deviation of time between forward packets
- Peak was observed around Fwd IAT Std=0.
- There are small number of observations in the range: Fwd IAT Std>=2 and Fwd IAT Std<=3, Fwd IAT Std>=3 and Fwd IAT Std<=4, Fwd IAT Std>4.
Fwd IAT Max: Maximum time between forward packets
- Peak was observed around Fwd IAT Max=0.
- We observed negative values on X-axis, thus, we need to check the actual values under the column to determine if data is accurate or invalid.
- There are scattered but very small number of observations between Fwd IAT Max=0.0 and Fwd IAT Max=1.0
Fwd IAT Min: Minimum time between forward packets
- Peak was observed around Fwd IAT Min=0.
- We observed negative values on X-axis, thus, we need to check the actual values under the column to determine if data is accurate or invalid.
Bwd IAT Total: Total time between backward packets
- Peak was observed around Bwd IAT Total=0.
- After the peak, there is consistent decline in results.
- There are relatively smaller peaks at Bwd IAT Total=0.6 and Bwd IAT Total=1.125
- There was a plateau region observed between Bwd IAT Total>=0.625 and Bwd IAT Total<=0.675
Bwd IAT Mean: Mean time between backward packets
- Peak was observed around Bwd IAT Mean=0.
- After the peak, there is consistent decline in results.
- Most observations are stacked on left side of the graph, near the peak.
- On X-axis values lie in the range 0.0 to +1.2
Bwd IAT Std: Standard deviation of time between backward packets
- The distribution is skewed towards right: Positively skewed.
- Peak was observed around Bwd IAT Std=0
- After the peak, there is consistent decline in results.
- There was plateau region observed between Bwd IAT Std>=1.169 and Bwd IAT Std=2
- As the value of Bwd IAT Std increases, the size of bins decreases. In between there are a few exceptions where size of bin is greater than their neighbors.
Bwd IAT Max: Maximum time between backward packets
- The distribution is skewed towards right: Positively skewed.
- Peak was observed around Bwd IAT Max=0.0
- After the peak, there is consistent decline in results.
- There are relatively smaller peaks at Bwd IAT Max=0.125 and Bwd IAT Max=0.575
- Since there are multiple peaks at significant distance apart, we can also call the graph multi-modal.
- The bins prior and after all three peaks are very small.
Bwd IAT Min: Minimum time between backward packets
- The distribution is skewed towards right: Positively skewed.
- Peak was observed around Bwd IAT Min=0
- After the peak, there is significant decline in results.
- On X-axis values lie in the range 0.0 to 1.2
Fwd PSH Flags: Forward packets with PUSH flags
- Most of the values are concentrated in the first bin at Fwd PSH Flags=0.0
- There were few observations at Fwd PSH Flags=1.0. This may indicate outlier in the data.
- There were no results between Fwd PSH Flags=0.0 and Fwd PSH Flags=1.0
Fwd Header Length: Length of header in forward packets
- The distribution is skewed towards left: Negatively skewed.
- Peak was observed around Fwd Header Length=0.0
- There were no results for Fwd Header Length>0.0
- There are relatively smaller bins on the left-hand side of the peak.
- On X-axis values lie in the range -2.0 to 0.0
Bwd Header Length: Length of header in backward packets
- Peak was observed around Bwd Header Length=0.0
- Most values are concentrated at the peak.
- There were few observations at Bwd Header Length=-1.75, -1, -0.6, -0.30
- There were no results for Bwd Header Length>0.0
- On X-axis values lie in the range -1.75 to 0.0
Fwd Packets/s: Forward packets per second
- The distribution is skewed towards right: Positively skewed.
- Peak was observed around Fwd Packets/s=0
- From Fwd Packets/s=0.0 to 1.5, the values are stacked to the right hand side of peak.
- There are relatively smaller peaks at Fwd Packets/s= 2.0, 3.0, 4.0
- There is a wide gap (no results) between Fwd Packets/s=3.0 and Fwd Packets/s=4.0
- Most values are concentrated between Fwd Packets/s=0.0 and Fwd Packets/s=1.5. Within this range the graph also resembles a J-shaped graph.
Bwd Packets/s: Backward packets per second
- The distribution is skewed towards right: Positively skewed.
- Peak was observed around Bwd Packets/s=0.0
- Most values are concentrated between Bwd Packets/s=0.0 and Bwd Packets/s=0.5. Within this range the graph also resembles a J-shaped graph.
- There are relatively smaller peaks at Bwd Packets/s=0.5, 1.0 and 2.0
- After Bwd Packets/s>=1.0, the bins are scattered and gaps were observed at irregular intervals on the x-axis.
Packet Length Max: Maximum length of packets
- The distribution is skewed towards right: Positively skewed.
- Peak was observed around Packet Length Max=0
- After the peak, there is significant decline in results between Packet Length Max>=0 and Packet Length Max<=10000.
- Between Packet Length Max=0 and Packet Length Max=10000, the graph also resembles a J-shaped graph.
- Between Packet Length Max=10000 to 26000, the results are significantly lower than Packet Length Max=0 to 10000.
- There were no results observed between Packet Length Max=26000 to 30000, 50000 to 60000.
- There are some results observed between Packet Length Max=30000 to 50000.
- There are a small number of results observed for Packet Length Max>60000. This may indicate outliers in the data.
Packet Length Mean: Mean length of packets
- On X-axis, values lie in the range 0 to 17500.
- The distribution is a J-shaped graph.
- Peak was observed around Packet Length Mean=0
- All other bins are stacked against the peak on its right hand side.
- There is a constant decline of results as we move towards right side of the graph.
- The results are concentrated between Packet Length Mean>=0 and Packet Length Mean<5000.
- There is a small observation at Packet Length Mean=17500. This may indicate an outlier in the data.
Packet Length Std: Standard deviation length of packets
- The distribution is a J-shaped graph.
- Peak was observed around Packet Length Std=0
- All other bins are stacked against the peak on its right hand side.
- Most values are concentrated between Packet Length Std>=0 and Packet Length Std<=5000.
- There is an observation at Packet Length Std>20000. This may indicate an outlier in the data.
- On X-axis, values lie in the range 0 to 20000.
Packet Length Variance: Variance of length of packets
- The distribution is skewed towards right: Positively skewed.
- Peak was observed around Packet Length Variance=0
- Most values are concentrated between Packet Length Variance>=0 and Packet Length Variance<=1.
- There is an observation after long gap at Packet Length Variance>5. This may indicate an outlier in the data.
SYN Flag Count: Number of SYN flags
- Peak was observed at SYN Flag Count=0.
- Most values are concentrated at the peak.
- There are a few observations at SYN Flag Count=1.0. This may indicate outlier in the data.
URG Flag Count: Number of URG flags
- Peak was observed at URG Flag Count=0.
- Most values are concentrated at the peak.
- There are a few observations at URG Flag Count=1.0. This may indicate outlier in the data.
Avg Packet Size: Average packet size
- The distribution is J-shaped graph.
- Most of the values are stacked at left end and then it continuously declines as we move towards right hand side of the x-axis.
- Peak was observed at Avg Packet Size=0.
- Most values are concentrated between Avg Packet Size>=0 and Avg Packet Size<5000.
- There were some values after a long gap between Avg Packet Size>5000 and Avg Packet Size<=17500. This may indicate outliers in the data.
Avg Fwd Segment Size: Average forward segment size
- The distribution is skewed towards right: Positively skewed.
- Peak was observed at Avg Fwd Segment Size=0.
- After the peak, there is consistent decline in results.
- Most values are concentrated between Avg Fwd Segment Size>=0 and Avg Fwd Segment Size<=5000.
- There were some values around Avg Fwd Segment Size=7500.
- There were a couple of values observed in the ranges Avg Fwd Segment Size>10000 and Avg Fwd Segment Size<12500, and Avg Fwd Segment Size>=15000. This may indicate outliers in the data.
Avg Bwd Segment Size: Average backward segment size
- The distribution is J-shaped graph.
- Most of the values are stacked at left end and then it continuously declines as we move towards right hand side of the x-axis.
- Peak was observed at Avg Bwd Segment Size=0.
- Most values are concentrated between Avg Bwd Segment Size>=0 and Avg Bwd Segment Size<=5000.
- There is a long gap observed after Avg Bwd Segment Size>5000.
- On extreme right end side of the graph, between Avg Bwd Segment Size>=30000 and Avg Bwd Segment Size<=35000, few values were observed. This may indicate outlier in the data.
Subflow Fwd Packets: Subflow forward packets
- Peak was observed at Subflow Fwd Packets=0.
- After the peak, there is significant decline in results up to Subflow Fwd Packets=50000.
- There is a plateau region observed between Subflow Fwd Packets>=100000 and Subflow Fwd Packets<=150000.
- There were decline in the number of results observed after Subflow Fwd Packets>=150000.
- There are many values between Subflow Fwd Packets>=50000 and Subflow Fwd Packets<=150000.
- There is a value after Subflow Fwd Packets>300000. This may indicate outlier in the data.
Subflow Fwd Bytes: Subflow forward bytes
- The distribution is a J-shaped graph.
- Most of the values are stacked at left end and then it continuously declines as we move towards right hand side of the x-axis.
- Most values are concentrated between Subflow Fwd Bytes>=0 and Subflow Fwd Bytes<0.2
- There were couple of values observed around Subflow Fwd Bytes=0.4 and Subflow Fwd Bytes>1.4. This may indicate outlier in the data.
Subflow Bwd Packets: Subflow backward packets
- The distribution is skewed towards right: Positively skewed.
- Peak was observed at Subflow Bwd Packets=0.
- After the peak, there is consistent decline in results.
- Most values are concentrated between Subflow Bwd Packets>=0 and Subflow Bwd Packets<=50000.
- After Subflow Bwd Packets>50000, there are many small plateau regions at irregular gaps up to Subflow Bwd Packets<300000.
Subflow Bwd Bytes: Subflow backward bytes
- The distribution is skewed towards right: Positively skewed.
- Peak was observed at Subflow Bwd Bytes=0.
- Between Subflow Bwd Bytes>=0 and Subflow Bwd Bytes<=1, the graph appeared similar to J-shaped graph.
- Most values are concentrated between Subflow Bwd Bytes>=0 and Subflow Bwd Bytes<=1.
- There are some plateau regions on right hand side of the peak at irregular gaps.
- On the X-axis values lie in the range 0 to 7.
Init Fwd Win Bytes: Initial forward window size
- There are two large peaks at Init Fwd Win Bytes=0 and Init Fwd Win Bytes=10000.
- There are smaller peaks at Init Fwd Win Bytes=30000 and Init Fwd Win Bytes>60000.
- Between the peaks, the frequency of the bins is comparatively very low.
- There are no gaps in the results observed on X-axis of the graph.
- Since the graph has multiple peaks, we can also call it multi-modal.
- From broad overview, as we move from left to right hand side of the graph, the results decrease. But, due to tall peaks observed in between, we cannot conclude consistent decline of results.
Init Bwd Win Bytes: Initial backward window size
- There are three main peaks from overall observation of the graph: Init Bwd Win Bytes=0, 30000, 60000.
- The tallest peak was observed at Init Bwd Win Bytes=0, the second tallest was at Init Bwd Win Bytes=60000 and the smallest peak among the three was observed at Init Bwd Win Bytes=30000.
- Between the peaks, the frequency of the bins is comparatively very low.
- There are no gaps in the results observed on X-axis of the graph.
- Since the graph has multiple peaks, we can also call it multi-modal.
Fwd Act Data Packets: Forward packets with actual data
- Peak was observed at Fwd Act Data Packets=0.
- After the peak, there is significant decline in results up to Fwd Act Data Packets=50000.
- There is a plateau region observed between Fwd Act Data Packets>=100000 and Fwd Act Data Packets<=150000.
- There are some values observed after Fwd Act Data Packets>300000. This may indicate outlier in the data.
Fwd Seg Size Min: Minimum segment size in forward packets
- Peak was observed at Fwd Seg Size Min=0.0
- On the X-axis value lie in the range -1.4 to 0.0. Thus, the values on X-axis are all negative, we need to check the actual values under the column to determine if data is accurate or invalid.
- There are some values observed at Fwd Seg Size Min=-1.4, Fwd Seg Size Min>-1.2 and Fwd Seg Size Min<-1.0, Fwd Seg Size Min>-0.6 and Fwd Seg Size Min<-0.4
Active Mean: Mean active time
- The distribution is skewed towards right: Positively skewed.
- Peak was observed at Active Mean=0.
- After the peak, there is significant decline in results.
- There are two plateau regions observed at Active Mean=0.4 and Active Mean=0.6
- There are no gaps in the results observed on X-axis of the graph
Active Std: Standard deviation of active time
- The distribution is skewed towards right: Positively skewed.
- Peak was observed at Active Std=0.
- After the peak, there is consistent decline in results up to Active Std=3.
- The first plateau region is observed between Active Std>=3 and Active Std<=4.
- There is a decline in the results between Active Std>=4 and Active Std<=5.
- There is second plateau in the graph observed near Active Std=5.
- There is decline in the results after Active Std>5.
Active Max: Maximum active time
- The distribution is skewed towards right: Positively skewed.
- Peak was observed at Active Max=0.
- After the peak, there is consistent decline in results up to Active Max=0.6
- Around Active Max=0.6, there is a relatively smaller peak compared to main peak, and a plateau region of 2 bins around it.
- Similarly, around Active Max=0.8, there is a relatively smaller peak compared to main peak, and a plateau region of 2 bins around it.
- On X-axis values lie in the range 0 to 1.2
- There are no gaps in the results observed on X-axis of the graph.
Active Min: Minimum active time
- The distribution is skewed towards right: Positively skewed.
- Peak was observed at Active Min=0.
- There is relatively smaller peak at Active Min=0.8 and a plateau region around it.
- On X-axis value lie in the range 0 to 1.2
- There are no gaps in the results observed on X-axis of the graph.
Idle Mean: Mean idle time
- Peak was observed at Idle Mean=0.
- Most values are concentrated in the bin represented by the peak.
- There are some values observed at Idle Mean=2.0, 3.0, 3.5, 4.0
- There are large gaps observed on X-axis of the graph after the peak.
Idle Std: Standard deviation of idle time
- Peak was observed at Idle Std=0.0
- Most values are concentrated in the bin represented by the peak.
- There are some values observed at Idle Std=1.0, 1.5, 2.0 and 2.5
- There are large gaps observed on X-axis of the graph after the peak.
Idle Max: Maximum idle time
- Peak was observed at Idle Max=0.0
- Most values are concentrated in the bin represented by the peak.
- There are some values observed at Idle Max=0.4, 0.6, 0.8, 1.0.
- There are large gaps observed on X-axis of the graph after the peak.
Idle Min: Minimum idle time
- Peak was observed at Idle Min=0.0
- Most values are concentrated in the bin represented by the peak.
- There is a value observed at Idle Min=2.5, after a large gap on the X-axis. This may indicate an outlier in the data.
# Distribution of the fine-grained target `Label` as horizontal bars, drawn
# once on a linear count axis and once on a log count axis so that the rare
# labels remain visible next to the dominant one.
label_counts = cic_df['Label'].value_counts()
for use_log, chart_title in (
    (False, 'Label Distribution (Normal Scale)'),
    (True, 'Label Distribution (Log Scale)'),
):
    plt.figure(figsize=(12, 12))
    label_counts.plot.barh(log=use_log)
    plt.title(chart_title)
    plt.show()

# Same pair of charts for the coarse target `ClassLabel`, as vertical bars
# with the tick labels kept horizontal.
class_counts = cic_df['ClassLabel'].value_counts()
for use_log, chart_title in (
    (False, 'ClassLabel Distribution (Normal Scale)'),
    (True, 'ClassLabel Distribution (Log Scale)'),
):
    plt.figure(figsize=(12, 6))
    class_counts.plot.bar(log=use_log)
    plt.title(chart_title)
    plt.xticks(rotation=0)
    plt.show()
# Summary statistics for every column (numeric and categorical), shown with
# one row per feature.
cic_df.describe(include='all').T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Flow Duration | 9167271.0 | NaN | NaN | NaN | 15907223.442477 | 656982607.269506 | -919011000000.0 | 11604.0 | 396803.0 | 5562536.0 | 120000000.0 |
| Total Fwd Packets | 9167271.0 | NaN | NaN | NaN | 40.796369 | 2066.318093 | 0.0 | 2.0 | 3.0 | 7.0 | 309629.0 |
| Total Backward Packets | 9167271.0 | NaN | NaN | NaN | 9.505533 | 580.575061 | 0.0 | 1.0 | 2.0 | 5.0 | 291922.0 |
| Fwd Packets Length Total | 9167271.0 | NaN | NaN | NaN | 2063.895115 | 83587.211939 | 0.0 | 29.0 | 97.0 | 935.0 | 144391846.0 |
| Bwd Packets Length Total | 9167271.0 | NaN | NaN | NaN | 10011.181786 | 1281318.910198 | 0.0 | 0.0 | 232.0 | 964.0 | 655453030.0 |
| Fwd Packet Length Max | 9167271.0 | NaN | NaN | NaN | 294.705839 | 501.859251 | 0.0 | 20.0 | 55.0 | 507.0 | 64440.0 |
| Fwd Packet Length Mean | 9167271.0 | NaN | NaN | NaN | 81.27935 | 142.242706 | 0.0 | 7.0 | 44.0 | 107.666664 | 16529.314453 |
| Fwd Packet Length Std | 9167271.0 | NaN | NaN | NaN | 104.328316 | 198.990021 | 0.0 | 0.0 | 11.547006 | 180.710632 | 18401.582031 |
| Bwd Packet Length Max | 9167271.0 | NaN | NaN | NaN | 607.085739 | 1180.5395 | 0.0 | 0.0 | 152.0 | 964.0 | 65160.0 |
| Bwd Packet Length Mean | 9167271.0 | NaN | NaN | NaN | 200.295654 | 379.298279 | 0.0 | 0.0 | 108.0 | 216.375 | 33879.285156 |
| Bwd Packet Length Std | 9167271.0 | NaN | NaN | NaN | 240.994995 | 504.050537 | 0.0 | 0.0 | 0.0 | 405.464783 | 21326.238281 |
| Flow Bytes/s | 9167271.0 | NaN | NaN | NaN | 2854904.490479 | 63544921.718519 | -261000000.0 | 55.596724 | 993.774576 | 27169.700651 | 2944000000.0 |
| Flow Packets/s | 9167271.0 | NaN | NaN | NaN | 10998.516442 | 103814.481722 | -2000000.0 | 1.465227 | 16.189285 | 497.945973 | 4000000.0 |
| Flow IAT Mean | 9167271.0 | NaN | NaN | NaN | 4577766.0 | 296521888.0 | -828219981824.0 | 2579.5 | 82560.335938 | 788482.8125 | 120000000.0 |
| Flow IAT Std | 9167271.0 | NaN | NaN | NaN | 2389986.25 | 449936384.0 | 0.0 | 0.0 | 18573.603516 | 836723.1875 | 474354483200.0 |
| Flow IAT Max | 9167271.0 | NaN | NaN | NaN | 10513515.967775 | 877591030.44273 | -828220000000.0 | 10593.0 | 223837.0 | 5109311.5 | 979781000000.0 |
| Flow IAT Min | 9167271.0 | NaN | NaN | NaN | 2752844.970628 | 994810251.425043 | -947405000000.0 | 3.0 | 14.0 | 470.0 | 120000000.0 |
| Fwd IAT Total | 9167271.0 | NaN | NaN | NaN | 15304688.663146 | 656974431.36816 | -919011000000.0 | 283.0 | 71923.0 | 4712353.0 | 120000000.0 |
| Fwd IAT Mean | 9167271.0 | NaN | NaN | NaN | 5135515.5 | 296550368.0 | -828219981824.0 | 135.0 | 28517.800781 | 1074626.625 | 120000000.0 |
| Fwd IAT Std | 9167271.0 | NaN | NaN | NaN | 2542754.0 | 449947776.0 | 0.0 | 0.0 | 454.427399 | 399539.28125 | 474354483200.0 |
| Fwd IAT Max | 9167271.0 | NaN | NaN | NaN | 10060776.647337 | 877585868.180635 | -828220000000.0 | 204.0 | 61619.0 | 4226764.0 | 979781000000.0 |
| Fwd IAT Min | 9167271.0 | NaN | NaN | NaN | 3002675.727242 | 994818677.362318 | -947405000000.0 | 2.0 | 36.0 | 455.0 | 120000000.0 |
| Bwd IAT Total | 9167271.0 | NaN | NaN | NaN | 9415138.051235 | 28114788.410477 | 0.0 | 0.0 | 731.0 | 1252558.5 | 120000000.0 |
| Bwd IAT Mean | 9167271.0 | NaN | NaN | NaN | 1223600.375 | 6206843.5 | 0.0 | 0.0 | 646.0 | 263630.15625 | 120000000.0 |
| Bwd IAT Std | 9167271.0 | NaN | NaN | NaN | 1224655.75 | 4738748.5 | 0.0 | 0.0 | 0.0 | 282084.390625 | 84835320.0 |
| Bwd IAT Max | 9167271.0 | NaN | NaN | NaN | 3654958.646934 | 13277469.581051 | 0.0 | 0.0 | 708.0 | 953075.0 | 120000000.0 |
| Bwd IAT Min | 9167271.0 | NaN | NaN | NaN | 482945.861238 | 5587847.968851 | 0.0 | 0.0 | 3.0 | 305.0 | 120000000.0 |
| Fwd PSH Flags | 9167271.0 | NaN | NaN | NaN | 0.031361 | 0.174291 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| Fwd Header Length | 9167271.0 | NaN | NaN | NaN | -8642949.045872 | 926438618.023802 | -212543795000.0 | 40.0 | 72.0 | 168.0 | 134480904.0 |
| Bwd Header Length | 9167271.0 | NaN | NaN | NaN | -54198.682322 | 12840228.879468 | -17003494240.0 | 8.0 | 60.0 | 136.0 | 5838440.0 |
| Fwd Packets/s | 9167271.0 | NaN | NaN | NaN | 9276.251953 | 99356.585938 | 0.0 | 0.875042 | 8.615973 | 269.56601 | 4000000.0 |
| Bwd Packets/s | 9167271.0 | NaN | NaN | NaN | 1739.180176 | 18565.78125 | 0.0 | 0.140176 | 3.323374 | 77.047539 | 2000000.0 |
| Packet Length Max | 9167271.0 | NaN | NaN | NaN | 708.834548 | 1220.914561 | 0.0 | 46.0 | 232.0 | 964.0 | 65160.0 |
| Packet Length Mean | 9167271.0 | NaN | NaN | NaN | 142.053604 | 209.506439 | 0.0 | 30.75 | 78.666664 | 155.375 | 17344.984375 |
| Packet Length Std | 9167271.0 | NaN | NaN | NaN | 220.316681 | 383.539764 | 0.0 | 8.763561 | 73.900833 | 319.470306 | 22788.287109 |
| Packet Length Variance | 9167271.0 | NaN | NaN | NaN | 195633.8125 | 957366.4375 | 0.0 | 76.800003 | 5461.333496 | 102061.273438 | 519000000.0 |
| SYN Flag Count | 9167271.0 | NaN | NaN | NaN | 0.04044 | 0.19699 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| URG Flag Count | 9167271.0 | NaN | NaN | NaN | 0.036218 | 0.186833 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| Avg Packet Size | 9167271.0 | NaN | NaN | NaN | 159.688553 | 229.775696 | 0.0 | 41.0 | 99.5 | 174.0 | 17478.408203 |
| Avg Fwd Segment Size | 9167271.0 | NaN | NaN | NaN | 81.27935 | 142.242706 | 0.0 | 7.0 | 44.0 | 107.666664 | 16529.314453 |
| Avg Bwd Segment Size | 9167271.0 | NaN | NaN | NaN | 200.295654 | 379.298279 | 0.0 | 0.0 | 108.0 | 216.375 | 33879.285156 |
| Subflow Fwd Packets | 9167271.0 | NaN | NaN | NaN | 40.796369 | 2066.318093 | 0.0 | 2.0 | 3.0 | 7.0 | 309629.0 |
| Subflow Fwd Bytes | 9167271.0 | NaN | NaN | NaN | 2063.891879 | 83586.713236 | 0.0 | 29.0 | 97.0 | 935.0 | 144391846.0 |
| Subflow Bwd Packets | 9167271.0 | NaN | NaN | NaN | 9.505533 | 580.575061 | 0.0 | 1.0 | 2.0 | 5.0 | 291922.0 |
| Subflow Bwd Bytes | 9167271.0 | NaN | NaN | NaN | 10011.036974 | 1281298.795172 | 0.0 | 0.0 | 232.0 | 964.0 | 655453030.0 |
| Init Fwd Win Bytes | 9167271.0 | NaN | NaN | NaN | 10559.004783 | 18924.951881 | -1.0 | -1.0 | 2049.0 | 8192.0 | 65535.0 |
| Init Bwd Win Bytes | 9167271.0 | NaN | NaN | NaN | 8373.292812 | 19433.923807 | -1.0 | -1.0 | 123.0 | 259.0 | 65535.0 |
| Fwd Act Data Packets | 9167271.0 | NaN | NaN | NaN | 36.34433 | 2053.127215 | 0.0 | 0.0 | 1.0 | 4.0 | 309628.0 |
| Fwd Seg Size Min | 9167271.0 | NaN | NaN | NaN | -1071545.079784 | 33726253.943877 | -1408237563.0 | 20.0 | 20.0 | 20.0 | 67240452.0 |
| Active Mean | 9167271.0 | NaN | NaN | NaN | 116771.984375 | 1476716.125 | 0.0 | 0.0 | 0.0 | 0.0 | 114000000.0 |
| Active Std | 9167271.0 | NaN | NaN | NaN | 55209.441406 | 854685.3125 | 0.0 | 0.0 | 0.0 | 0.0 | 74953352.0 |
| Active Max | 9167271.0 | NaN | NaN | NaN | 193040.412274 | 1981967.859903 | 0.0 | 0.0 | 0.0 | 0.0 | 114000000.0 |
| Active Min | 9167271.0 | NaN | NaN | NaN | 82811.023526 | 1272059.803775 | 0.0 | 0.0 | 0.0 | 0.0 | 114000000.0 |
| Idle Mean | 9167271.0 | NaN | NaN | NaN | 8015579.0 | 350376032.0 | 0.0 | 0.0 | 0.0 | 0.0 | 395571429376.0 |
| Idle Std | 9167271.0 | NaN | NaN | NaN | 549194.4375 | 225147856.0 | 0.0 | 0.0 | 0.0 | 0.0 | 262247858176.0 |
| Idle Max | 9167271.0 | NaN | NaN | NaN | 8775394.133697 | 832169632.699721 | 0.0 | 0.0 | 0.0 | 0.0 | 979781000000.0 |
| Idle Min | 9167271.0 | NaN | NaN | NaN | 7413830.334114 | 84632298.040205 | 0.0 | 0.0 | 0.0 | 0.0 | 239934000000.0 |
| Label | 9167271 | 33 | Benign | 7185881 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ClassLabel | 9167271 | 8 | Benign | 7185881 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Computing the count and percentage of negative values for each feature
# where a negative minimum was observed in the summary statistics.
features_with_negative_values = [
    "Flow Duration", "Flow Bytes/s", "Flow Packets/s", "Flow IAT Mean",
    "Flow IAT Max", "Flow IAT Min", "Fwd IAT Total", "Fwd IAT Mean",
    "Fwd IAT Max", "Fwd IAT Min", "Fwd Header Length", "Bwd Header Length",
    "Init Fwd Win Bytes", "Init Bwd Win Bytes", "Fwd Seg Size Min",
]

# Take the denominator from the frame itself so the percentages stay correct
# if rows are added or removed upstream (it was previously a hard-coded count).
total_rows = len(cic_df)

negative_proportion = []
for feature in features_with_negative_values:
    negative_count = (cic_df[feature] < 0).sum()
    negative_percentage = negative_count * 100 / total_rows
    negative_proportion.append((feature, negative_count, negative_percentage))

negative_proportion_df = pd.DataFrame(
    negative_proportion,
    columns=["Feature name", "Number of negative values", "Percentage of negative values"],
)
print(negative_proportion_df)
Feature name Number of negative values \
0 Flow Duration 96
1 Flow Bytes/s 53
2 Flow Packets/s 96
3 Flow IAT Mean 96
4 Flow IAT Max 85
5 Flow IAT Min 2816
6 Fwd IAT Total 14
7 Fwd IAT Mean 14
8 Fwd IAT Max 3
9 Fwd IAT Min 32
10 Fwd Header Length 50907
11 Bwd Header Length 257
12 Init Fwd Win Bytes 2658779
13 Init Bwd Win Bytes 3766283
14 Fwd Seg Size Min 74142
Percentage of negative values
0 0.001047
1 0.000578
2 0.001047
3 0.001047
4 0.000927
5 0.030718
6 0.000153
7 0.000153
8 0.000033
9 0.000349
10 0.555312
11 0.002803
12 29.002950
13 41.084015
14 0.808768
# Summary statistics restricted to the features that contain negative values,
# one row per feature.
cic_df[features_with_negative_values].describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Flow Duration | 9167271.0 | 1.590722e+07 | 6.569826e+08 | -9.190110e+11 | 11604.000000 | 396803.000000 | 5.562536e+06 | 1.200000e+08 |
| Flow Bytes/s | 9167271.0 | 2.854904e+06 | 6.354492e+07 | -2.610000e+08 | 55.596724 | 993.774576 | 2.716970e+04 | 2.944000e+09 |
| Flow Packets/s | 9167271.0 | 1.099852e+04 | 1.038145e+05 | -2.000000e+06 | 1.465227 | 16.189285 | 4.979460e+02 | 4.000000e+06 |
| Flow IAT Mean | 9167271.0 | 4.577766e+06 | 2.965219e+08 | -8.282200e+11 | 2579.500000 | 82560.335938 | 7.884828e+05 | 1.200000e+08 |
| Flow IAT Max | 9167271.0 | 1.051352e+07 | 8.775910e+08 | -8.282200e+11 | 10593.000000 | 223837.000000 | 5.109312e+06 | 9.797810e+11 |
| Flow IAT Min | 9167271.0 | 2.752845e+06 | 9.948103e+08 | -9.474050e+11 | 3.000000 | 14.000000 | 4.700000e+02 | 1.200000e+08 |
| Fwd IAT Total | 9167271.0 | 1.530469e+07 | 6.569744e+08 | -9.190110e+11 | 283.000000 | 71923.000000 | 4.712353e+06 | 1.200000e+08 |
| Fwd IAT Mean | 9167271.0 | 5.135516e+06 | 2.965504e+08 | -8.282200e+11 | 135.000000 | 28517.800781 | 1.074627e+06 | 1.200000e+08 |
| Fwd IAT Max | 9167271.0 | 1.006078e+07 | 8.775859e+08 | -8.282200e+11 | 204.000000 | 61619.000000 | 4.226764e+06 | 9.797810e+11 |
| Fwd IAT Min | 9167271.0 | 3.002676e+06 | 9.948187e+08 | -9.474050e+11 | 2.000000 | 36.000000 | 4.550000e+02 | 1.200000e+08 |
| Fwd Header Length | 9167271.0 | -8.642949e+06 | 9.264386e+08 | -2.125438e+11 | 40.000000 | 72.000000 | 1.680000e+02 | 1.344809e+08 |
| Bwd Header Length | 9167271.0 | -5.419868e+04 | 1.284023e+07 | -1.700349e+10 | 8.000000 | 60.000000 | 1.360000e+02 | 5.838440e+06 |
| Init Fwd Win Bytes | 9167271.0 | 1.055900e+04 | 1.892495e+04 | -1.000000e+00 | -1.000000 | 2049.000000 | 8.192000e+03 | 6.553500e+04 |
| Init Bwd Win Bytes | 9167271.0 | 8.373293e+03 | 1.943392e+04 | -1.000000e+00 | -1.000000 | 123.000000 | 2.590000e+02 | 6.553500e+04 |
| Fwd Seg Size Min | 9167271.0 | -1.071545e+06 | 3.372625e+07 | -1.408238e+09 | 20.000000 | 20.000000 | 2.000000e+01 | 6.724045e+07 |
# Take both window-size features (Init Fwd Win Bytes and Init Bwd Win Bytes)
# out of the list, in place, so the next imputation step skips them.
for excluded_feature in ("Init Fwd Win Bytes", "Init Bwd Win Bytes"):
    features_with_negative_values.remove(excluded_feature)
print("List of updated list of features with negative values: ",features_with_negative_values)
List of updated list of features with negative values: ['Flow Duration', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Max', 'Fwd IAT Min', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Seg Size Min']
# Imputing all negative values with their feature's respective median, except
# for Init Fwd Win Bytes and Init Bwd Win Bytes (already removed from the list).
# The median is computed over the non-negative values only, so the invalid
# negatives do not influence the replacement value.
for c in features_with_negative_values:
    column = cic_df[c]
    median_value = column[column >= 0].median()
    # Vectorised replacement: a per-element Python lambda over millions of rows
    # is very slow. Cast to float64 first so the column ends up with the same
    # float64 dtype the element-wise version produced.
    cic_df[c] = column.astype('float64').mask(column < 0, median_value)

# Fetching statistical summary after performing imputation
cic_df[features_with_negative_values].describe().transpose()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Flow Duration | 9167271.0 | 1.657217e+07 | 3.407960e+07 | 1.000000 | 11605.000000 | 396839.000000 | 5.562536e+06 | 1.200000e+08 |
| Flow Bytes/s | 9167271.0 | 2.855014e+06 | 6.354482e+07 | 0.000000 | 55.599289 | 993.799601 | 2.716970e+04 | 2.944000e+09 |
| Flow Packets/s | 9167271.0 | 1.101542e+04 | 1.036516e+05 | 0.016667 | 1.465322 | 16.190865 | 4.979460e+02 | 4.000000e+06 |
| Flow IAT Mean | 9167271.0 | 4.717615e+06 | 1.564841e+07 | 0.333333 | 2580.000000 | 82566.500000 | 7.884828e+05 | 1.200000e+08 |
| Flow IAT Max | 9167271.0 | 1.062255e+07 | 8.321956e+08 | 1.000000 | 10595.000000 | 223853.000000 | 5.109312e+06 | 9.797810e+11 |
| Flow IAT Min | 9167271.0 | 3.915124e+06 | 1.560000e+07 | 0.000000 | 3.000000 | 14.000000 | 4.700000e+02 | 1.200000e+08 |
| Fwd IAT Total | 9167271.0 | 1.596963e+07 | 3.393343e+07 | 0.000000 | 283.000000 | 71924.000000 | 4.712353e+06 | 1.200000e+08 |
| Fwd IAT Mean | 9167271.0 | 5.275366e+06 | 1.617462e+07 | 0.000000 | 135.000000 | 28518.000000 | 1.074627e+06 | 1.200000e+08 |
| Fwd IAT Max | 9167271.0 | 1.016981e+07 | 8.321902e+08 | 0.000000 | 204.000000 | 61619.000000 | 4.226764e+06 | 9.797810e+11 |
| Fwd IAT Min | 9167271.0 | 4.164954e+06 | 1.611036e+07 | 0.000000 | 2.000000 | 36.000000 | 4.550000e+02 | 1.200000e+08 |
| Fwd Header Length | 9167271.0 | 4.675070e+02 | 4.810506e+04 | 0.000000 | 40.000000 | 72.000000 | 1.680000e+02 | 1.344809e+08 |
| Bwd Header Length | 9167271.0 | 2.119982e+02 | 1.162439e+04 | 0.000000 | 8.000000 | 60.000000 | 1.360000e+02 | 5.838440e+06 |
| Fwd Seg Size Min | 9167271.0 | 2.814285e+01 | 2.220807e+04 | 0.000000 | 20.000000 | 20.000000 | 2.000000e+01 | 6.724045e+07 |
We have successfully handled negative values among 13 features.
We have two more features with substantial number of negative values: -
- Init Fwd Win Bytes : 29% values are negative
- Init Bwd Win Bytes : 41% values are negative
If we drop the rows with negative values among the two columns, we will lose massive volume of information from other features in the dataset.
If we drop the two columns, we will lose out the data from those columns having positive values and which may be important.
# Distribution of both targets (Label and ClassLabel) among the rows whose
# Init Fwd Win Bytes value is negative.
fwd_window_is_negative = cic_df['Init Fwd Win Bytes'] < 0
negative_init_fwd_win_bytes = cic_df[fwd_window_is_negative]
for heading, target in (("\nLabel: \n", 'Label'), ("\nClassLabel: \n", 'ClassLabel')):
    print(heading, negative_init_fwd_win_bytes[target].value_counts())
Label: Label Benign 2340737 DDoS-NTP 120966 DDoS-TFTP 98611 Infiltration 31750 DDoS-UDP 28855 DDoS-MSSQL 11779 DDoS-Ddossim 5115 DDoS-DNS 3662 DoS-Slowread 2786 DDoS-LDAP 2085 DDoS-SNMP 2013 DDoS-UDPLag 1985 DDoS-Slowloris 1858 DoS-Slowheaders 1649 DDoS-LOIC-HTTP 797 DoS-Hulk 747 DoS-Rudy 699 DDoS-NetBIOS 668 DoS-Goldeneye 632 DoS-Slowbody 621 Portscan 290 Botnet 258 Webattack-bruteforce 145 DDoS-Syn 67 Webattack-XSS 4 Name: count, dtype: int64 ClassLabel: ClassLabel Benign 2340737 DDoS 278461 Infiltration 31750 DoS 7134 Portscan 290 Botnet 258 Webattack 149 Name: count, dtype: int64
# Distribution of both targets (Label and ClassLabel) among the rows whose
# Init Bwd Win Bytes value is negative.
# NOTE: the variable name is reused from the Init Fwd Win Bytes step; from
# here on it holds the Init Bwd Win Bytes subset.
bwd_window_is_negative = cic_df['Init Bwd Win Bytes'] < 0
negative_init_fwd_win_bytes = cic_df[bwd_window_is_negative]
for heading, target in (("\nLabel: \n", 'Label'), ("\nClassLabel: \n", 'ClassLabel')):
    print(heading, negative_init_fwd_win_bytes[target].value_counts())
Label: Label Benign 2916889 DDoS-LOIC-HTTP 286075 DoS-Hulk 128830 DDoS-NTP 120979 DDoS-TFTP 98678 DDoS 46524 Infiltration 42377 DDoS-HOIC 35111 DDoS-UDP 28855 DoS-Goldeneye 16206 DDoS-MSSQL 11780 DDoS-Syn 9716 DDoS-UDPLag 6436 DoS-Slowloris 3873 DDoS-DNS 3662 DoS-Slowhttptest 3145 DDoS-LDAP 2086 DDoS-SNMP 2013 Botnet 1520 DDoS-NetBIOS 668 Portscan 352 Webattack-bruteforce 280 Webattack-XSS 128 Bruteforce-FTP 63 Webattack-SQLi 20 Bruteforce-SSH 17 Name: count, dtype: int64 ClassLabel: ClassLabel Benign 2916889 DDoS 652583 DoS 152054 Infiltration 42377 Botnet 1520 Webattack 428 Portscan 352 Bruteforce 80 Name: count, dtype: int64
- At present, the dataset is heavily imbalanced, with 78% of records classified as Benign and 22% of records classified as Malicious.
- Based on the results obtained above by fetching the counts for "negative values" of the two features we observed: -
- Init Fwd Win Bytes: 88% records are Benign and 12% records are Malicious.
- Init Bwd Win Bytes: 77% records are Benign (2,916,889 of 3,766,283) and 23% records are Malicious.
- Thus, the negative values for the two features do not give any different characteristic of events when compared with characteristics of the complete dataset.
- Moreover, both features by definition cannot have negative values. Thus, it indicates data quality issues in those records.
- We can perform prediction of data for the two features having negative values, however, due to constraint of time we will not adopt that approach.
- As a result, we shall perform imputation using the respective median values.
# Imputing the negative values of the two initial-window features with the
# median of each feature's non-negative values.
for c in ['Init Fwd Win Bytes', 'Init Bwd Win Bytes']:
    column = cic_df[c]
    median_value = column[column >= 0].median()
    # Vectorised replacement: a per-element Python lambda over millions of rows
    # is very slow. Cast to float64 first so the column ends up with the same
    # float64 dtype the element-wise version produced.
    cic_df[c] = column.astype('float64').mask(column < 0, median_value)

# Fetching statistical summary after performing imputation
cic_df.describe(include='all').transpose()
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Flow Duration | 9167271.0 | NaN | NaN | NaN | 16572166.706035 | 34079602.158443 | 1.0 | 11605.0 | 396839.0 | 5562536.0 | 120000000.0 |
| Total Fwd Packets | 9167271.0 | NaN | NaN | NaN | 40.796369 | 2066.318093 | 0.0 | 2.0 | 3.0 | 7.0 | 309629.0 |
| Total Backward Packets | 9167271.0 | NaN | NaN | NaN | 9.505533 | 580.575061 | 0.0 | 1.0 | 2.0 | 5.0 | 291922.0 |
| Fwd Packets Length Total | 9167271.0 | NaN | NaN | NaN | 2063.895115 | 83587.211939 | 0.0 | 29.0 | 97.0 | 935.0 | 144391846.0 |
| Bwd Packets Length Total | 9167271.0 | NaN | NaN | NaN | 10011.181786 | 1281318.910198 | 0.0 | 0.0 | 232.0 | 964.0 | 655453030.0 |
| Fwd Packet Length Max | 9167271.0 | NaN | NaN | NaN | 294.705839 | 501.859251 | 0.0 | 20.0 | 55.0 | 507.0 | 64440.0 |
| Fwd Packet Length Mean | 9167271.0 | NaN | NaN | NaN | 81.27935 | 142.242706 | 0.0 | 7.0 | 44.0 | 107.666664 | 16529.314453 |
| Fwd Packet Length Std | 9167271.0 | NaN | NaN | NaN | 104.328316 | 198.990021 | 0.0 | 0.0 | 11.547006 | 180.710632 | 18401.582031 |
| Bwd Packet Length Max | 9167271.0 | NaN | NaN | NaN | 607.085739 | 1180.5395 | 0.0 | 0.0 | 152.0 | 964.0 | 65160.0 |
| Bwd Packet Length Mean | 9167271.0 | NaN | NaN | NaN | 200.295654 | 379.298279 | 0.0 | 0.0 | 108.0 | 216.375 | 33879.285156 |
| Bwd Packet Length Std | 9167271.0 | NaN | NaN | NaN | 240.994995 | 504.050537 | 0.0 | 0.0 | 0.0 | 405.464783 | 21326.238281 |
| Flow Bytes/s | 9167271.0 | NaN | NaN | NaN | 2855014.466597 | 63544820.766049 | 0.0 | 55.599289 | 993.799601 | 27169.700651 | 2944000000.0 |
| Flow Packets/s | 9167271.0 | NaN | NaN | NaN | 11015.42319 | 103651.622741 | 0.016667 | 1.465322 | 16.190865 | 497.945973 | 4000000.0 |
| Flow IAT Mean | 9167271.0 | NaN | NaN | NaN | 4717614.943425 | 15648408.216738 | 0.333333 | 2580.0 | 82566.5 | 788482.8125 | 120000000.0 |
| Flow IAT Std | 9167271.0 | NaN | NaN | NaN | 2389986.25 | 449936384.0 | 0.0 | 0.0 | 18573.603516 | 836723.1875 | 474354483200.0 |
| Flow IAT Max | 9167271.0 | NaN | NaN | NaN | 10622550.600614 | 832195631.034822 | 1.0 | 10595.0 | 223853.0 | 5109311.5 | 979781000000.0 |
| Flow IAT Min | 9167271.0 | NaN | NaN | NaN | 3915123.586074 | 15599995.873778 | 0.0 | 3.0 | 14.0 | 470.0 | 120000000.0 |
| Fwd IAT Total | 9167271.0 | NaN | NaN | NaN | 15969627.880819 | 33933433.81845 | 0.0 | 283.0 | 71924.0 | 4712353.0 | 120000000.0 |
| Fwd IAT Mean | 9167271.0 | NaN | NaN | NaN | 5275366.172166 | 16174616.572548 | 0.0 | 135.0 | 28518.0 | 1074626.625 | 120000000.0 |
| Fwd IAT Std | 9167271.0 | NaN | NaN | NaN | 2542754.0 | 449947776.0 | 0.0 | 0.0 | 454.427399 | 399539.28125 | 474354483200.0 |
| Fwd IAT Max | 9167271.0 | NaN | NaN | NaN | 10169809.224737 | 832190246.518281 | 0.0 | 204.0 | 61619.0 | 4226764.0 | 979781000000.0 |
| Fwd IAT Min | 9167271.0 | NaN | NaN | NaN | 4164954.338113 | 16110357.348586 | 0.0 | 2.0 | 36.0 | 455.0 | 120000000.0 |
| Bwd IAT Total | 9167271.0 | NaN | NaN | NaN | 9415138.051235 | 28114788.410477 | 0.0 | 0.0 | 731.0 | 1252558.5 | 120000000.0 |
| Bwd IAT Mean | 9167271.0 | NaN | NaN | NaN | 1223600.375 | 6206843.5 | 0.0 | 0.0 | 646.0 | 263630.15625 | 120000000.0 |
| Bwd IAT Std | 9167271.0 | NaN | NaN | NaN | 1224655.75 | 4738748.5 | 0.0 | 0.0 | 0.0 | 282084.390625 | 84835320.0 |
| Bwd IAT Max | 9167271.0 | NaN | NaN | NaN | 3654958.646934 | 13277469.581051 | 0.0 | 0.0 | 708.0 | 953075.0 | 120000000.0 |
| Bwd IAT Min | 9167271.0 | NaN | NaN | NaN | 482945.861238 | 5587847.968851 | 0.0 | 0.0 | 3.0 | 305.0 | 120000000.0 |
| Fwd PSH Flags | 9167271.0 | NaN | NaN | NaN | 0.031361 | 0.174291 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| Fwd Header Length | 9167271.0 | NaN | NaN | NaN | 467.506976 | 48105.058152 | 0.0 | 40.0 | 72.0 | 168.0 | 134480904.0 |
| Bwd Header Length | 9167271.0 | NaN | NaN | NaN | 211.998229 | 11624.394465 | 0.0 | 8.0 | 60.0 | 136.0 | 5838440.0 |
| Fwd Packets/s | 9167271.0 | NaN | NaN | NaN | 9276.251953 | 99356.585938 | 0.0 | 0.875042 | 8.615973 | 269.56601 | 4000000.0 |
| Bwd Packets/s | 9167271.0 | NaN | NaN | NaN | 1739.180176 | 18565.78125 | 0.0 | 0.140176 | 3.323374 | 77.047539 | 2000000.0 |
| Packet Length Max | 9167271.0 | NaN | NaN | NaN | 708.834548 | 1220.914561 | 0.0 | 46.0 | 232.0 | 964.0 | 65160.0 |
| Packet Length Mean | 9167271.0 | NaN | NaN | NaN | 142.053604 | 209.506439 | 0.0 | 30.75 | 78.666664 | 155.375 | 17344.984375 |
| Packet Length Std | 9167271.0 | NaN | NaN | NaN | 220.316681 | 383.539764 | 0.0 | 8.763561 | 73.900833 | 319.470306 | 22788.287109 |
| Packet Length Variance | 9167271.0 | NaN | NaN | NaN | 195633.8125 | 957366.4375 | 0.0 | 76.800003 | 5461.333496 | 102061.273438 | 519000000.0 |
| SYN Flag Count | 9167271.0 | NaN | NaN | NaN | 0.04044 | 0.19699 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| URG Flag Count | 9167271.0 | NaN | NaN | NaN | 0.036218 | 0.186833 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| Avg Packet Size | 9167271.0 | NaN | NaN | NaN | 159.688553 | 229.775696 | 0.0 | 41.0 | 99.5 | 174.0 | 17478.408203 |
| Avg Fwd Segment Size | 9167271.0 | NaN | NaN | NaN | 81.27935 | 142.242706 | 0.0 | 7.0 | 44.0 | 107.666664 | 16529.314453 |
| Avg Bwd Segment Size | 9167271.0 | NaN | NaN | NaN | 200.295654 | 379.298279 | 0.0 | 0.0 | 108.0 | 216.375 | 33879.285156 |
| Subflow Fwd Packets | 9167271.0 | NaN | NaN | NaN | 40.796369 | 2066.318093 | 0.0 | 2.0 | 3.0 | 7.0 | 309629.0 |
| Subflow Fwd Bytes | 9167271.0 | NaN | NaN | NaN | 2063.891879 | 83586.713236 | 0.0 | 29.0 | 97.0 | 935.0 | 144391846.0 |
| Subflow Bwd Packets | 9167271.0 | NaN | NaN | NaN | 9.505533 | 580.575061 | 0.0 | 1.0 | 2.0 | 5.0 | 291922.0 |
| Subflow Bwd Bytes | 9167271.0 | NaN | NaN | NaN | 10011.036974 | 1281298.795172 | 0.0 | 0.0 | 232.0 | 964.0 | 655453030.0 |
| Init Fwd Win Bytes | 9167271.0 | NaN | NaN | NaN | 12935.216454 | 17938.509047 | 0.0 | 8192.0 | 8192.0 | 8192.0 | 65535.0 |
| Init Bwd Win Bytes | 9167271.0 | NaN | NaN | NaN | 8470.251087 | 19392.445994 | 0.0 | 219.0 | 235.0 | 259.0 | 65535.0 |
| Fwd Act Data Packets | 9167271.0 | NaN | NaN | NaN | 36.34433 | 2053.127215 | 0.0 | 0.0 | 1.0 | 4.0 | 309628.0 |
| Fwd Seg Size Min | 9167271.0 | NaN | NaN | NaN | 28.142852 | 22208.067019 | 0.0 | 20.0 | 20.0 | 20.0 | 67240452.0 |
| Active Mean | 9167271.0 | NaN | NaN | NaN | 116771.984375 | 1476716.125 | 0.0 | 0.0 | 0.0 | 0.0 | 114000000.0 |
| Active Std | 9167271.0 | NaN | NaN | NaN | 55209.441406 | 854685.3125 | 0.0 | 0.0 | 0.0 | 0.0 | 74953352.0 |
| Active Max | 9167271.0 | NaN | NaN | NaN | 193040.412274 | 1981967.859903 | 0.0 | 0.0 | 0.0 | 0.0 | 114000000.0 |
| Active Min | 9167271.0 | NaN | NaN | NaN | 82811.023526 | 1272059.803775 | 0.0 | 0.0 | 0.0 | 0.0 | 114000000.0 |
| Idle Mean | 9167271.0 | NaN | NaN | NaN | 8015579.0 | 350376032.0 | 0.0 | 0.0 | 0.0 | 0.0 | 395571429376.0 |
| Idle Std | 9167271.0 | NaN | NaN | NaN | 549194.4375 | 225147856.0 | 0.0 | 0.0 | 0.0 | 0.0 | 262247858176.0 |
| Idle Max | 9167271.0 | NaN | NaN | NaN | 8775394.133697 | 832169632.699721 | 0.0 | 0.0 | 0.0 | 0.0 | 979781000000.0 |
| Idle Min | 9167271.0 | NaN | NaN | NaN | 7413830.334114 | 84632298.040205 | 0.0 | 0.0 | 0.0 | 0.0 | 239934000000.0 |
| Label | 9167271 | 33 | Benign | 7185881 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ClassLabel | 9167271 | 8 | Benign | 7185881 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Derive the binary target isMalicious from ClassLabel:
# 1 for every non-Benign class, 0 for Benign.
is_attack = cic_df['ClassLabel'].ne('Benign')
cic_df['isMalicious'] = np.where(is_attack, 1, 0)
cic_df.head(10)
| Flow Duration | Total Fwd Packets | Total Backward Packets | Fwd Packets Length Total | Bwd Packets Length Total | Fwd Packet Length Max | Fwd Packet Length Mean | Fwd Packet Length Std | Bwd Packet Length Max | Bwd Packet Length Mean | ... | Active Std | Active Max | Active Min | Idle Mean | Idle Std | Idle Max | Idle Min | Label | ClassLabel | isMalicious | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4.0 | 2 | 0 | 12.0 | 0.0 | 6.0 | 6.000000 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | Benign | 0 |
| 1 | 1.0 | 2 | 0 | 12.0 | 0.0 | 6.0 | 6.000000 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | Benign | 0 |
| 2 | 3.0 | 2 | 0 | 12.0 | 0.0 | 6.0 | 6.000000 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | Benign | 0 |
| 3 | 1.0 | 2 | 0 | 12.0 | 0.0 | 6.0 | 6.000000 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | Benign | 0 |
| 4 | 609.0 | 7 | 4 | 484.0 | 414.0 | 233.0 | 69.142860 | 111.967896 | 207.0 | 103.5 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | Benign | 0 |
| 5 | 879.0 | 9 | 4 | 656.0 | 3064.0 | 313.0 | 72.888885 | 136.153809 | 1532.0 | 766.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | Benign | 0 |
| 6 | 1160.0 | 9 | 6 | 3134.0 | 3048.0 | 1552.0 | 348.222229 | 682.482544 | 1518.0 | 508.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | Benign | 0 |
| 7 | 524.0 | 7 | 4 | 2812.0 | 2820.0 | 1397.0 | 401.714294 | 679.914856 | 1410.0 | 705.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | Benign | 0 |
| 8 | 6.0 | 1 | 1 | 6.0 | 6.0 | 6.0 | 6.000000 | 0.000000 | 6.0 | 6.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | Benign | 0 |
| 9 | 1119.0 | 9 | 6 | 3160.0 | 3060.0 | 1565.0 | 351.111115 | 688.214966 | 1524.0 | 510.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | Benign | 0 |
10 rows × 60 columns
# Balance of the binary target (isMalicious is 1 when ClassLabel is not 'Benign', otherwise 0).
cic_df['isMalicious'].value_counts()
isMalicious 0 7185881 1 1981390 Name: count, dtype: int64
# Number of records per ClassLabel category in the full dataset.
cic_df['ClassLabel'].value_counts()
ClassLabel Benign 7185881 DDoS 1234727 DoS 397344 Botnet 145968 Bruteforce 103244 Infiltration 94857 Webattack 2995 Portscan 2255 Name: count, dtype: int64
# Frame dimensions (rows, columns) before the Label column is dropped.
cic_df.shape
(9167271, 60)
# Discard the fine-grained Label column, then show the new dimensions.
cic_df = cic_df.drop(columns=['Label'])
cic_df.shape
(9167271, 59)
We have created a new feature: isMalicious, which is binary. We will use this for Binary classification.
We will use the feature: ClassLabel for Multi-class classification, to determine the type of attack.
We have dropped the feature: Label, because it gives a further sub-type of the attack, which is not in the scope of our work.
As a result, our two target features in the dataset are: -
- isMalicious: For binary classification
- ClassLabel: For multi-class classification
Since the original dataset is too large, carrying out analysis on the complete dataset exhausts the system's memory and the notebook stalls.
As a result, we will create a sample of the original dataset to carry out all the analysis.
We will take 20% of the original dataset as the sample size.
# Draw a reproducible 20% random sample (without replacement) of the dataset
# and show its dimensions.
SAMPLE_FRACTION = 0.2
sample_size = int(SAMPLE_FRACTION * cic_df.shape[0])
sampled_cic_df = cic_df.sample(
    n=sample_size,
    replace=False,
    random_state=42,
)
sampled_cic_df.shape
(1833454, 59)
# Balance of the binary target within the sampled dataset.
sampled_cic_df['isMalicious'].value_counts()
isMalicious 0 1437467 1 395987 Name: count, dtype: int64
# Number of records per ClassLabel category within the sampled dataset.
sampled_cic_df['ClassLabel'].value_counts()
ClassLabel Benign 1437467 DDoS 246982 DoS 79186 Botnet 29348 Bruteforce 20546 Infiltration 18870 Webattack 625 Portscan 430 Name: count, dtype: int64
- In the sampled dataset, the imbalanced nature of target is very similar to imbalanced nature of original dataset.
- In the sampled dataset, all the categories under the column: ClassLabel are observed in the same order as the original dataset.
# Data type of every column in the sampled dataset.
sampled_cic_df.dtypes
Flow Duration float64 Total Fwd Packets int32 Total Backward Packets int32 Fwd Packets Length Total float64 Bwd Packets Length Total float64 Fwd Packet Length Max float64 Fwd Packet Length Mean float32 Fwd Packet Length Std float32 Bwd Packet Length Max float64 Bwd Packet Length Mean float32 Bwd Packet Length Std float32 Flow Bytes/s float64 Flow Packets/s float64 Flow IAT Mean float64 Flow IAT Std float32 Flow IAT Max float64 Flow IAT Min float64 Fwd IAT Total float64 Fwd IAT Mean float64 Fwd IAT Std float32 Fwd IAT Max float64 Fwd IAT Min float64 Bwd IAT Total float64 Bwd IAT Mean float32 Bwd IAT Std float32 Bwd IAT Max float64 Bwd IAT Min float64 Fwd PSH Flags int8 Fwd Header Length float64 Bwd Header Length float64 Fwd Packets/s float32 Bwd Packets/s float32 Packet Length Max float64 Packet Length Mean float32 Packet Length Std float32 Packet Length Variance float32 SYN Flag Count int8 URG Flag Count int8 Avg Packet Size float32 Avg Fwd Segment Size float32 Avg Bwd Segment Size float32 Subflow Fwd Packets int32 Subflow Fwd Bytes int32 Subflow Bwd Packets int32 Subflow Bwd Bytes int32 Init Fwd Win Bytes float64 Init Bwd Win Bytes float64 Fwd Act Data Packets int32 Fwd Seg Size Min float64 Active Mean float32 Active Std float32 Active Max float64 Active Min float64 Idle Mean float32 Idle Std float32 Idle Max float64 Idle Min float64 ClassLabel object isMalicious int32 dtype: object
#Identifying outliers in sampled dataset
# Flags, per feature, the records lying outside the Tukey fences
# (Q1 - 1.5*IQR, Q3 + 1.5*IQR) and reports their count and percentage.

# drop() already returns a new frame, so no explicit copy of the full
# sampled dataset is needed before removing the two target columns.
independent_features = sampled_cic_df.drop(columns=['ClassLabel', 'isMalicious'])

q1 = independent_features.quantile(0.25)
q3 = independent_features.quantile(0.75)
iqr = q3 - q1

# Boolean frame: True where a value falls below the lower or above the upper fence.
outlier = (independent_features < (q1 - 1.5 * iqr)) | (independent_features > (q3 + 1.5 * iqr))

outlier_count = outlier.sum()
# The mean of a boolean column is the fraction of True values.
outlier_percentage = (outlier.mean() * 100).round(2)

outlier_stats = pd.concat(
    [outlier_count, outlier_percentage],
    axis=1,
    keys=['Outlier Count', 'Outlier Percentage'],
)
print(outlier_stats)
Outlier Count Outlier Percentage Flow Duration 362139 19.75 Total Fwd Packets 167485 9.13 Total Backward Packets 176328 9.62 Fwd Packets Length Total 71763 3.91 Bwd Packets Length Total 265389 14.47 Fwd Packet Length Max 24476 1.33 Fwd Packet Length Mean 74245 4.05 Fwd Packet Length Std 21018 1.15 Bwd Packet Length Max 69888 3.81 Bwd Packet Length Mean 140674 7.67 Bwd Packet Length Std 56299 3.07 Flow Bytes/s 377550 20.59 Flow Packets/s 380170 20.74 Flow IAT Mean 346826 18.92 Flow IAT Std 284585 15.52 Flow IAT Max 255816 13.95 Flow IAT Min 404158 22.04 Fwd IAT Total 352629 19.23 Fwd IAT Mean 355920 19.41 Fwd IAT Std 395445 21.57 Fwd IAT Max 252256 13.76 Fwd IAT Min 420002 22.91 Bwd IAT Total 316674 17.27 Bwd IAT Mean 257401 14.04 Bwd IAT Std 291284 15.89 Bwd IAT Max 266733 14.55 Bwd IAT Min 389592 21.25 Fwd PSH Flags 57754 3.15 Fwd Header Length 127832 6.97 Bwd Header Length 147846 8.06 Fwd Packets/s 379750 20.71 Bwd Packets/s 381136 20.79 Packet Length Max 77407 4.22 Packet Length Mean 179993 9.82 Packet Length Std 68317 3.73 Packet Length Variance 160010 8.73 SYN Flag Count 74451 4.06 URG Flag Count 66218 3.61 Avg Packet Size 177462 9.68 Avg Fwd Segment Size 74245 4.05 Avg Bwd Segment Size 140674 7.67 Subflow Fwd Packets 167485 9.13 Subflow Fwd Bytes 71763 3.91 Subflow Bwd Packets 176328 9.62 Subflow Bwd Bytes 265389 14.47 Init Fwd Win Bytes 719371 39.24 Init Bwd Win Bytes 684281 37.32 Fwd Act Data Packets 122997 6.71 Fwd Seg Size Min 678664 37.02 Active Mean 266956 14.56 Active Std 150242 8.19 Active Max 266956 14.56 Active Min 266956 14.56 Idle Mean 385990 21.05 Idle Std 173591 9.47 Idle Max 385990 21.05 Idle Min 385990 21.05
#Fetching outliers grouped by isMalicious
# For every feature and every isMalicious value, count the records that fall
# outside the Tukey fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR) computed within that
# group, and express the count as a percentage of the group's size.
group_column = sampled_cic_df['isMalicious']
# One boolean row mask per class, built once instead of once per feature.
group_masks = {value: group_column == value for value in group_column.unique()}

outlier_counts = {}
for i in independent_features:
    feature_values = sampled_cic_df[i]
    for attack_type, mask in group_masks.items():
        attack_data = feature_values[mask]
        # nanpercentile ignores missing values, so a NaN in a group cannot
        # turn both fences into NaN and silently report zero outliers.
        q1, q3 = np.nanpercentile(attack_data, [25, 75])
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        num_outliers = ((attack_data < lower_bound) | (attack_data > upper_bound)).sum()
        outlier_percent = num_outliers / len(attack_data) * 100
        outlier_counts[(i, attack_type)] = (num_outliers, outlier_percent)

# Report: one section per feature, one line per class.
for i in independent_features:
    print(f'Feature: {i}')
    for attack_type in group_masks:
        num_outliers, outlier_percent = outlier_counts[(i, attack_type)]
        print(f'- {attack_type}: {num_outliers} ({outlier_percent:.2f}%)')
    print()
Feature: Flow Duration - 0: 271501 (18.89%) - 1: 69145 (17.46%) Feature: Total Fwd Packets - 0: 82873 (5.77%) - 1: 48965 (12.37%) Feature: Total Backward Packets - 0: 87244 (6.07%) - 1: 23486 (5.93%) Feature: Fwd Packets Length Total - 0: 39934 (2.78%) - 1: 79983 (20.20%) Feature: Bwd Packets Length Total - 0: 177607 (12.36%) - 1: 71509 (18.06%) Feature: Fwd Packet Length Max - 0: 10822 (0.75%) - 1: 2362 (0.60%) Feature: Fwd Packet Length Mean - 0: 28829 (2.01%) - 1: 54263 (13.70%) Feature: Fwd Packet Length Std - 0: 14881 (1.04%) - 1: 533 (0.13%) Feature: Bwd Packet Length Max - 0: 27262 (1.90%) - 1: 48455 (12.24%) Feature: Bwd Packet Length Mean - 0: 121408 (8.45%) - 1: 49662 (12.54%) Feature: Bwd Packet Length Std - 0: 40833 (2.84%) - 1: 48473 (12.24%) Feature: Flow Bytes/s - 0: 319778 (22.25%) - 1: 57618 (14.55%) Feature: Flow Packets/s - 0: 301763 (20.99%) - 1: 51261 (12.95%) Feature: Flow IAT Mean - 0: 258296 (17.97%) - 1: 51632 (13.04%) Feature: Flow IAT Std - 0: 186774 (12.99%) - 1: 80205 (20.25%) Feature: Flow IAT Max - 0: 166932 (11.61%) - 1: 67261 (16.99%) Feature: Flow IAT Min - 0: 308112 (21.43%) - 1: 95849 (24.21%) Feature: Fwd IAT Total - 0: 260387 (18.11%) - 1: 74387 (18.79%) Feature: Fwd IAT Mean - 0: 252056 (17.53%) - 1: 69485 (17.55%) Feature: Fwd IAT Std - 0: 310032 (21.57%) - 1: 91566 (23.12%) Feature: Fwd IAT Max - 0: 159866 (11.12%) - 1: 69246 (17.49%) Feature: Fwd IAT Min - 0: 324450 (22.57%) - 1: 94919 (23.97%) Feature: Bwd IAT Total - 0: 251328 (17.48%) - 1: 44968 (11.36%) Feature: Bwd IAT Mean - 0: 226066 (15.73%) - 1: 62888 (15.88%) Feature: Bwd IAT Std - 0: 223055 (15.52%) - 1: 80501 (20.33%) Feature: Bwd IAT Max - 0: 236348 (16.44%) - 1: 49815 (12.58%) Feature: Bwd IAT Min - 0: 336707 (23.42%) - 1: 32229 (8.14%) Feature: Fwd PSH Flags - 0: 54784 (3.81%) - 1: 2970 (0.75%) Feature: Fwd Header Length - 0: 83369 (5.80%) - 1: 60675 (15.32%) Feature: Bwd Header Length - 0: 115152 (8.01%) - 1: 34435 (8.70%) Feature: Fwd Packets/s - 0: 298236 
(20.75%) - 1: 51461 (13.00%) Feature: Bwd Packets/s - 0: 313900 (21.84%) - 1: 74384 (18.78%) Feature: Packet Length Max - 0: 28885 (2.01%) - 1: 48465 (12.24%) Feature: Packet Length Mean - 0: 85914 (5.98%) - 1: 26172 (6.61%) Feature: Packet Length Std - 0: 20542 (1.43%) - 1: 48486 (12.24%) Feature: Packet Length Variance - 0: 111343 (7.75%) - 1: 49627 (12.53%) Feature: SYN Flag Count - 0: 69042 (4.80%) - 1: 5409 (1.37%) Feature: URG Flag Count - 0: 63194 (4.40%) - 1: 3024 (0.76%) Feature: Avg Packet Size - 0: 78454 (5.46%) - 1: 18905 (4.77%) Feature: Avg Fwd Segment Size - 0: 28829 (2.01%) - 1: 54263 (13.70%) Feature: Avg Bwd Segment Size - 0: 121408 (8.45%) - 1: 49662 (12.54%) Feature: Subflow Fwd Packets - 0: 82873 (5.77%) - 1: 48965 (12.37%) Feature: Subflow Fwd Bytes - 0: 39934 (2.78%) - 1: 79983 (20.20%) Feature: Subflow Bwd Packets - 0: 87244 (6.07%) - 1: 23486 (5.93%) Feature: Subflow Bwd Bytes - 0: 177607 (12.36%) - 1: 71509 (18.06%) Feature: Init Fwd Win Bytes - 0: 493994 (34.37%) - 1: 76819 (19.40%) Feature: Init Bwd Win Bytes - 0: 349532 (24.32%) - 1: 24436 (6.17%) Feature: Fwd Act Data Packets - 0: 76932 (5.35%) - 1: 50263 (12.69%) Feature: Fwd Seg Size Min - 0: 562595 (39.14%) - 1: 116069 (29.31%) Feature: Active Mean - 0: 207682 (14.45%) - 1: 59274 (14.97%) Feature: Active Std - 0: 138991 (9.67%) - 1: 11251 (2.84%) Feature: Active Max - 0: 207682 (14.45%) - 1: 59274 (14.97%) Feature: Active Min - 0: 207682 (14.45%) - 1: 59274 (14.97%) Feature: Idle Mean - 0: 272236 (18.94%) - 1: 64936 (16.40%) Feature: Idle Std - 0: 156188 (10.87%) - 1: 17403 (4.39%) Feature: Idle Max - 0: 272236 (18.94%) - 1: 67834 (17.13%) Feature: Idle Min - 0: 272236 (18.94%) - 1: 68185 (17.22%)
#Fetching outliers grouped by ClassLabel
# For every feature and every ClassLabel category, count the records that fall
# outside the Tukey fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR) computed within that
# category, and express the count as a percentage of the category's size.
group_column = sampled_cic_df['ClassLabel']
# One boolean row mask per category, built once instead of once per feature.
group_masks = {value: group_column == value for value in group_column.unique()}

outlier_counts = {}
for i in independent_features:
    feature_values = sampled_cic_df[i]
    for attack_type, mask in group_masks.items():
        attack_data = feature_values[mask]
        # nanpercentile ignores missing values, so a NaN in a category cannot
        # turn both fences into NaN and silently report zero outliers.
        q1, q3 = np.nanpercentile(attack_data, [25, 75])
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        num_outliers = ((attack_data < lower_bound) | (attack_data > upper_bound)).sum()
        outlier_percent = num_outliers / len(attack_data) * 100
        outlier_counts[(i, attack_type)] = (num_outliers, outlier_percent)

# Report: one section per feature, one line per category.
for i in independent_features:
    print(f'Feature: {i}')
    for attack_type in group_masks:
        num_outliers, outlier_percent = outlier_counts[(i, attack_type)]
        print(f'- {attack_type}: {num_outliers} ({outlier_percent:.2f}%)')
    print()
Feature: Flow Duration - Benign: 271501 (18.89%) - DDoS: 39367 (15.94%) - DoS: 0 (0.00%) - Infiltration: 3981 (21.10%) - Botnet: 3135 (10.68%) - Bruteforce: 2609 (12.70%) - Portscan: 94 (21.86%) - Webattack: 200 (32.00%) Feature: Total Fwd Packets - Benign: 82873 (5.77%) - DDoS: 35630 (14.43%) - DoS: 1542 (1.95%) - Infiltration: 1110 (5.88%) - Botnet: 1419 (4.84%) - Bruteforce: 2602 (12.66%) - Portscan: 34 (7.91%) - Webattack: 228 (36.48%) Feature: Total Backward Packets - Benign: 87244 (6.07%) - DDoS: 199 (0.08%) - DoS: 85 (0.11%) - Infiltration: 1194 (6.33%) - Botnet: 1540 (5.25%) - Bruteforce: 1912 (9.31%) - Portscan: 163 (37.91%) - Webattack: 211 (33.76%) Feature: Fwd Packets Length Total - Benign: 39934 (2.78%) - DDoS: 53978 (21.86%) - DoS: 3215 (4.06%) - Infiltration: 504 (2.67%) - Botnet: 1937 (6.60%) - Bruteforce: 2495 (12.14%) - Portscan: 70 (16.28%) - Webattack: 149 (23.84%) Feature: Bwd Packets Length Total - Benign: 177607 (12.36%) - DDoS: 16430 (6.65%) - DoS: 24 (0.03%) - Infiltration: 2138 (11.33%) - Botnet: 1610 (5.49%) - Bruteforce: 1913 (9.31%) - Portscan: 36 (8.37%) - Webattack: 122 (19.52%) Feature: Fwd Packet Length Max - Benign: 10822 (0.75%) - DDoS: 1658 (0.67%) - DoS: 117 (0.15%) - Infiltration: 575 (3.05%) - Botnet: 1937 (6.60%) - Bruteforce: 1330 (6.47%) - Portscan: 75 (17.44%) - Webattack: 149 (23.84%) Feature: Fwd Packet Length Mean - Benign: 28829 (2.01%) - DDoS: 53668 (21.73%) - DoS: 1100 (1.39%) - Infiltration: 707 (3.75%) - Botnet: 1937 (6.60%) - Bruteforce: 1822 (8.87%) - Portscan: 65 (15.12%) - Webattack: 149 (23.84%) Feature: Fwd Packet Length Std - Benign: 14881 (1.04%) - DDoS: 44707 (18.10%) - DoS: 137 (0.17%) - Infiltration: 351 (1.86%) - Botnet: 1937 (6.60%) - Bruteforce: 2556 (12.44%) - Portscan: 30 (6.98%) - Webattack: 122 (19.52%) Feature: Bwd Packet Length Max - Benign: 27262 (1.90%) - DDoS: 16149 (6.54%) - DoS: 1 (0.00%) - Infiltration: 0 (0.00%) - Botnet: 1610 (5.49%) - Bruteforce: 1327 (6.46%) - Portscan: 36 (8.37%) - 
Webattack: 122 (19.52%) Feature: Bwd Packet Length Mean - Benign: 121408 (8.45%) - DDoS: 16161 (6.54%) - DoS: 0 (0.00%) - Infiltration: 1013 (5.37%) - Botnet: 1764 (6.01%) - Bruteforce: 1908 (9.29%) - Portscan: 34 (7.91%) - Webattack: 122 (19.52%) Feature: Bwd Packet Length Std - Benign: 40833 (2.84%) - DDoS: 16141 (6.54%) - DoS: 402 (0.51%) - Infiltration: 74 (0.39%) - Botnet: 1764 (6.01%) - Bruteforce: 1911 (9.30%) - Portscan: 36 (8.37%) - Webattack: 122 (19.52%) Feature: Flow Bytes/s - Benign: 319778 (22.25%) - DDoS: 44053 (17.84%) - DoS: 10132 (12.80%) - Infiltration: 4291 (22.74%) - Botnet: 2845 (9.69%) - Bruteforce: 3082 (15.00%) - Portscan: 77 (17.91%) - Webattack: 149 (23.84%) Feature: Flow Packets/s - Benign: 301763 (20.99%) - DDoS: 43821 (17.74%) - DoS: 9453 (11.94%) - Infiltration: 3039 (16.10%) - Botnet: 2813 (9.58%) - Bruteforce: 2635 (12.82%) - Portscan: 73 (16.98%) - Webattack: 104 (16.64%) Feature: Flow IAT Mean - Benign: 258296 (17.97%) - DDoS: 44064 (17.84%) - DoS: 2243 (2.83%) - Infiltration: 3717 (19.70%) - Botnet: 3104 (10.58%) - Bruteforce: 2283 (11.11%) - Portscan: 94 (21.86%) - Webattack: 28 (4.48%) Feature: Flow IAT Std - Benign: 186774 (12.99%) - DDoS: 40394 (16.36%) - DoS: 402 (0.51%) - Infiltration: 3332 (17.66%) - Botnet: 3246 (11.06%) - Bruteforce: 2107 (10.26%) - Portscan: 91 (21.16%) - Webattack: 27 (4.32%) Feature: Flow IAT Max - Benign: 166932 (11.61%) - DDoS: 35723 (14.46%) - DoS: 0 (0.00%) - Infiltration: 1925 (10.20%) - Botnet: 3199 (10.90%) - Bruteforce: 2089 (10.17%) - Portscan: 93 (21.63%) - Webattack: 184 (29.44%) Feature: Flow IAT Min - Benign: 308112 (21.43%) - DDoS: 58129 (23.54%) - DoS: 8191 (10.34%) - Infiltration: 3260 (17.28%) - Botnet: 444 (1.51%) - Bruteforce: 658 (3.20%) - Portscan: 27 (6.28%) - Webattack: 44 (7.04%) Feature: Fwd IAT Total - Benign: 260387 (18.11%) - DDoS: 39677 (16.06%) - DoS: 0 (0.00%) - Infiltration: 3643 (19.31%) - Botnet: 2040 (6.95%) - Bruteforce: 2630 (12.80%) - Portscan: 92 (21.40%) - 
Webattack: 211 (33.76%) Feature: Fwd IAT Mean - Benign: 252056 (17.53%) - DDoS: 41957 (16.99%) - DoS: 1833 (2.31%) - Infiltration: 3998 (21.19%) - Botnet: 2218 (7.56%) - Bruteforce: 2114 (10.29%) - Portscan: 87 (20.23%) - Webattack: 28 (4.48%) Feature: Fwd IAT Std - Benign: 310032 (21.57%) - DDoS: 49536 (20.06%) - DoS: 0 (0.00%) - Infiltration: 4127 (21.87%) - Botnet: 2258 (7.69%) - Bruteforce: 2048 (9.97%) - Portscan: 42 (9.77%) - Webattack: 27 (4.32%) Feature: Fwd IAT Max - Benign: 159866 (11.12%) - DDoS: 36899 (14.94%) - DoS: 0 (0.00%) - Infiltration: 3437 (18.21%) - Botnet: 2002 (6.82%) - Bruteforce: 2124 (10.34%) - Portscan: 92 (21.40%) - Webattack: 28 (4.48%) Feature: Fwd IAT Min - Benign: 324450 (22.57%) - DDoS: 57863 (23.43%) - DoS: 8128 (10.26%) - Infiltration: 4453 (23.60%) - Botnet: 5116 (17.43%) - Bruteforce: 947 (4.61%) - Portscan: 52 (12.09%) - Webattack: 37 (5.92%) Feature: Bwd IAT Total - Benign: 251328 (17.48%) - DDoS: 4797 (1.94%) - DoS: 14913 (18.83%) - Infiltration: 3845 (20.38%) - Botnet: 3853 (13.13%) - Bruteforce: 2609 (12.70%) - Portscan: 106 (24.65%) - Webattack: 122 (19.52%) Feature: Bwd IAT Mean - Benign: 226066 (15.73%) - DDoS: 4753 (1.92%) - DoS: 15771 (19.92%) - Infiltration: 3809 (20.19%) - Botnet: 3866 (13.17%) - Bruteforce: 2034 (9.90%) - Portscan: 106 (24.65%) - Webattack: 122 (19.52%) Feature: Bwd IAT Std - Benign: 223055 (15.52%) - DDoS: 4616 (1.87%) - DoS: 19481 (24.60%) - Infiltration: 3904 (20.69%) - Botnet: 3885 (13.24%) - Bruteforce: 2211 (10.76%) - Portscan: 28 (6.51%) - Webattack: 122 (19.52%) Feature: Bwd IAT Max - Benign: 236348 (16.44%) - DDoS: 4705 (1.90%) - DoS: 12206 (15.41%) - Infiltration: 4209 (22.31%) - Botnet: 3844 (13.10%) - Bruteforce: 2125 (10.34%) - Portscan: 106 (24.65%) - Webattack: 122 (19.52%) Feature: Bwd IAT Min - Benign: 336707 (23.42%) - DDoS: 7815 (3.16%) - DoS: 12807 (16.17%) - Infiltration: 3848 (20.39%) - Botnet: 836 (2.85%) - Bruteforce: 1409 (6.86%) - Portscan: 106 (24.65%) - Webattack: 122 
(19.52%) Feature: Fwd PSH Flags - Benign: 54784 (3.81%) - DDoS: 6 (0.00%) - DoS: 944 (1.19%) - Infiltration: 1598 (8.47%) - Botnet: 1 (0.00%) - Bruteforce: 421 (2.05%) - Portscan: 0 (0.00%) - Webattack: 0 (0.00%) Feature: Fwd Header Length - Benign: 83369 (5.80%) - DDoS: 28846 (11.68%) - DoS: 1957 (2.47%) - Infiltration: 808 (4.28%) - Botnet: 1535 (5.23%) - Bruteforce: 2602 (12.66%) - Portscan: 2 (0.47%) - Webattack: 237 (37.92%) Feature: Bwd Header Length - Benign: 115152 (8.01%) - DDoS: 198 (0.08%) - DoS: 84 (0.11%) - Infiltration: 1574 (8.34%) - Botnet: 1540 (5.25%) - Bruteforce: 1912 (9.31%) - Portscan: 39 (9.07%) - Webattack: 237 (37.92%) Feature: Fwd Packets/s - Benign: 298236 (20.75%) - DDoS: 43502 (17.61%) - DoS: 7368 (9.30%) - Infiltration: 2986 (15.82%) - Botnet: 3398 (11.58%) - Bruteforce: 2352 (11.45%) - Portscan: 77 (17.91%) - Webattack: 104 (16.64%) Feature: Bwd Packets/s - Benign: 313900 (21.84%) - DDoS: 40283 (16.31%) - DoS: 16836 (21.26%) - Infiltration: 3723 (19.73%) - Botnet: 3438 (11.71%) - Bruteforce: 2355 (11.46%) - Portscan: 68 (15.81%) - Webattack: 237 (37.92%) Feature: Packet Length Max - Benign: 28885 (2.01%) - DDoS: 16149 (6.54%) - DoS: 1 (0.00%) - Infiltration: 0 (0.00%) - Botnet: 1150 (3.92%) - Bruteforce: 1328 (6.46%) - Portscan: 192 (44.65%) - Webattack: 149 (23.84%) Feature: Packet Length Mean - Benign: 85914 (5.98%) - DDoS: 13035 (5.28%) - DoS: 0 (0.00%) - Infiltration: 580 (3.07%) - Botnet: 1377 (4.69%) - Bruteforce: 3835 (18.67%) - Portscan: 201 (46.74%) - Webattack: 149 (23.84%) Feature: Packet Length Std - Benign: 20542 (1.43%) - DDoS: 16149 (6.54%) - DoS: 28 (0.04%) - Infiltration: 17 (0.09%) - Botnet: 1377 (4.69%) - Bruteforce: 3530 (17.18%) - Portscan: 36 (8.37%) - Webattack: 122 (19.52%) Feature: Packet Length Variance - Benign: 111343 (7.75%) - DDoS: 16161 (6.54%) - DoS: 2390 (3.02%) - Infiltration: 1414 (7.49%) - Botnet: 1377 (4.69%) - Bruteforce: 3530 (17.18%) - Portscan: 40 (9.30%) - Webattack: 122 (19.52%) Feature: SYN 
Flag Count - Benign: 69042 (4.80%) - DDoS: 1256 (0.51%) - DoS: 2133 (2.69%) - Infiltration: 1598 (8.47%) - Botnet: 1 (0.00%) - Bruteforce: 421 (2.05%) - Portscan: 0 (0.00%) - Webattack: 0 (0.00%) Feature: URG Flag Count - Benign: 63194 (4.40%) - DDoS: 96 (0.04%) - DoS: 2260 (2.85%) - Infiltration: 497 (2.63%) - Botnet: 41 (0.14%) - Bruteforce: 96 (0.47%) - Portscan: 9 (2.09%) - Webattack: 25 (4.00%) Feature: Avg Packet Size - Benign: 78454 (5.46%) - DDoS: 13314 (5.39%) - DoS: 0 (0.00%) - Infiltration: 648 (3.43%) - Botnet: 1377 (4.69%) - Bruteforce: 3835 (18.67%) - Portscan: 73 (16.98%) - Webattack: 149 (23.84%) Feature: Avg Fwd Segment Size - Benign: 28829 (2.01%) - DDoS: 53668 (21.73%) - DoS: 1100 (1.39%) - Infiltration: 707 (3.75%) - Botnet: 1937 (6.60%) - Bruteforce: 1822 (8.87%) - Portscan: 65 (15.12%) - Webattack: 149 (23.84%) Feature: Avg Bwd Segment Size - Benign: 121408 (8.45%) - DDoS: 16161 (6.54%) - DoS: 0 (0.00%) - Infiltration: 1013 (5.37%) - Botnet: 1764 (6.01%) - Bruteforce: 1908 (9.29%) - Portscan: 34 (7.91%) - Webattack: 122 (19.52%) Feature: Subflow Fwd Packets - Benign: 82873 (5.77%) - DDoS: 35630 (14.43%) - DoS: 1542 (1.95%) - Infiltration: 1110 (5.88%) - Botnet: 1419 (4.84%) - Bruteforce: 2602 (12.66%) - Portscan: 34 (7.91%) - Webattack: 228 (36.48%) Feature: Subflow Fwd Bytes - Benign: 39934 (2.78%) - DDoS: 53978 (21.86%) - DoS: 3215 (4.06%) - Infiltration: 504 (2.67%) - Botnet: 1937 (6.60%) - Bruteforce: 2495 (12.14%) - Portscan: 70 (16.28%) - Webattack: 149 (23.84%) Feature: Subflow Bwd Packets - Benign: 87244 (6.07%) - DDoS: 199 (0.08%) - DoS: 85 (0.11%) - Infiltration: 1194 (6.33%) - Botnet: 1540 (5.25%) - Bruteforce: 1912 (9.31%) - Portscan: 163 (37.91%) - Webattack: 211 (33.76%) Feature: Subflow Bwd Bytes - Benign: 177607 (12.36%) - DDoS: 16430 (6.65%) - DoS: 24 (0.03%) - Infiltration: 2138 (11.33%) - Botnet: 1610 (5.49%) - Bruteforce: 1913 (9.31%) - Portscan: 36 (8.37%) - Webattack: 122 (19.52%) Feature: Init Fwd Win Bytes - Benign: 
493994 (34.37%) - DDoS: 39933 (16.17%) - DoS: 29554 (37.32%) - Infiltration: 4293 (22.75%) - Botnet: 279 (0.95%) - Bruteforce: 1902 (9.26%) - Portscan: 0 (0.00%) - Webattack: 0 (0.00%) Feature: Init Bwd Win Bytes - Benign: 349532 (24.32%) - DDoS: 9377 (3.80%) - DoS: 14328 (18.09%) - Infiltration: 4318 (22.88%) - Botnet: 1557 (5.31%) - Bruteforce: 1873 (9.12%) - Portscan: 0 (0.00%) - Webattack: 0 (0.00%) Feature: Fwd Act Data Packets - Benign: 76932 (5.35%) - DDoS: 27532 (11.15%) - DoS: 9955 (12.57%) - Infiltration: 772 (4.09%) - Botnet: 1535 (5.23%) - Bruteforce: 1345 (6.55%) - Portscan: 9 (2.09%) - Webattack: 149 (23.84%) Feature: Fwd Seg Size Min - Benign: 562595 (39.14%) - DDoS: 25318 (10.25%) - DoS: 20310 (25.65%) - Infiltration: 447 (2.37%) - Botnet: 173 (0.59%) - Bruteforce: 11 (0.05%) - Portscan: 5 (1.16%) - Webattack: 1 (0.16%) Feature: Active Mean - Benign: 207682 (14.45%) - DDoS: 19286 (7.81%) - DoS: 14493 (18.30%) - Infiltration: 4527 (23.99%) - Botnet: 50 (0.17%) - Bruteforce: 0 (0.00%) - Portscan: 61 (14.19%) - Webattack: 27 (4.32%) Feature: Active Std - Benign: 138991 (9.67%) - DDoS: 6682 (2.71%) - DoS: 1859 (2.35%) - Infiltration: 2659 (14.09%) - Botnet: 48 (0.16%) - Bruteforce: 0 (0.00%) - Portscan: 3 (0.70%) - Webattack: 0 (0.00%) Feature: Active Max - Benign: 207682 (14.45%) - DDoS: 19286 (7.81%) - DoS: 14510 (18.32%) - Infiltration: 4527 (23.99%) - Botnet: 50 (0.17%) - Bruteforce: 0 (0.00%) - Portscan: 61 (14.19%) - Webattack: 27 (4.32%) Feature: Active Min - Benign: 207682 (14.45%) - DDoS: 19286 (7.81%) - DoS: 13659 (17.25%) - Infiltration: 4527 (23.99%) - Botnet: 50 (0.17%) - Bruteforce: 0 (0.00%) - Portscan: 61 (14.19%) - Webattack: 27 (4.32%) Feature: Idle Mean - Benign: 272236 (18.94%) - DDoS: 33705 (13.65%) - DoS: 0 (0.00%) - Infiltration: 1829 (9.69%) - Botnet: 50 (0.17%) - Bruteforce: 0 (0.00%) - Portscan: 61 (14.19%) - Webattack: 28 (4.48%) Feature: Idle Std - Benign: 156188 (10.87%) - DDoS: 10516 (4.26%) - DoS: 3548 (4.48%) - 
Infiltration: 3260 (17.28%) - Botnet: 48 (0.16%) - Bruteforce: 0 (0.00%) - Portscan: 3 (0.70%) - Webattack: 28 (4.48%) Feature: Idle Max - Benign: 272236 (18.94%) - DDoS: 35712 (14.46%) - DoS: 0 (0.00%) - Infiltration: 1921 (10.18%) - Botnet: 50 (0.17%) - Bruteforce: 0 (0.00%) - Portscan: 61 (14.19%) - Webattack: 28 (4.48%) Feature: Idle Min - Benign: 272236 (18.94%) - DDoS: 35579 (14.41%) - DoS: 0 (0.00%) - Infiltration: 1680 (8.90%) - Botnet: 50 (0.17%) - Bruteforce: 0 (0.00%) - Portscan: 61 (14.19%) - Webattack: 28 (4.48%)
Based on observations from the sampled dataset, out of 57 independent features, 12 features show a difference of roughly 10 percentage points or more between the outlier percentage of Malicious events and that of Benign events.
- Fwd Packets Length Total: Malicious > Benign: Difference = 17%
- Bwd Packet Length Max: Malicious > Benign: Difference = 10%
- Bwd Packet Length Std: Malicious > Benign: Difference = 10%
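- Bwd IAT Mean: Malicious < Benign: Difference = 15% (note: in the grouped output above, the ~15% gap, 23.42% Benign vs 8.14% Malicious, belongs to Bwd IAT Min; Bwd IAT Mean shows 15.73% vs 15.88%, so this entry should be re-checked)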
- Fwd Header Length: Malicious > Benign: Difference = 10%
- Packet Length Max: Malicious > Benign: Difference = 10%
- Packet Length Std: Malicious > Benign: Difference = 11%
- Avg Fwd Segment Size: Malicious > Benign: Difference = 11%
- Subflow Fwd Bytes: Malicious > Benign: Difference = 17%
- Init Fwd Win Bytes: Malicious < Benign: Difference = 15%
- Init Bwd Win Bytes: Malicious < Benign: Difference = 18%
- Fwd Seg Size Min: Malicious < Benign: Difference = 10%
Remaining 45 features have very similar percentage (nearly equal percentage) of outliers labelled as Benign and Malicious.
Out of the 12 features listed above, we have following 4 features with higher outliers percentage: -
- Init Fwd Win Bytes: 39.24% records are outliers
- Init Bwd Win Bytes: 37.32% records are outliers
- Fwd Seg Size Min: 37.02% records are outliers
- Bwd IAT Mean: 14.04% records are outliers
Among the above 4 features, the outlier percentage among Benign records is higher than among Malicious records. Thus, the features with a relatively higher number of outliers do not indicate any anomaly or provide differentiation to detect Malicious events.
The remaining 8 features each have at most 7% of records as outliers.
- Fwd Packets Length Total
- Bwd Packet Length Max
- Bwd Packet Length Std
- Fwd Header Length
- Packet Length Max
- Packet Length Std
- Avg Fwd Segment Size
- Subflow Fwd Bytes
For all of the above 8 features, the outlier percentage among Malicious events is higher than among Benign events. Thus, the features with a relatively smaller number of outliers help to provide a small differentiation of Malicious events from Benign events. As a result, their outliers can also be useful as anomalies in the dataset that point towards Malicious events.
Thus, for the features with a higher number of outliers, the outliers are more frequent among Benign events than among Malicious events, while for the features with a smaller number of outliers, the outliers are more frequent among Malicious events than among Benign events. For all 12 features, the difference in outlier percentage between the two categories is roughly 10 percentage points or more.
To handle outliers, we shall try to use 2 methods: -
- Winsorization
- Robust Scaling
We shall perform winsorization on following 4 features: -
- Init Fwd Win Bytes
- Init Bwd Win Bytes
- Fwd Seg Size Min
- Bwd IAT Mean
Init Fwd Win Bytes
# Raw distribution and summary statistics for Init Fwd Win Bytes.
feature_name = "Init Fwd Win Bytes"
raw_series = sampled_cic_df[feature_name]

# Histogram of the raw values; log=True puts the frequency axis on a log scale.
_, raw_ax = plt.subplots()
raw_ax.hist(raw_series, edgecolor="black", log=True)
raw_ax.set_xlabel(feature_name)
raw_ax.set_ylabel("Frequency")
raw_ax.set_title(f"Sampled data, log transformed histogram: {feature_name}")
plt.show()

# The 95th / 5th percentiles are the bounds used by the winsorization step.
upper_limit = raw_series.quantile(0.95)
lower_limit = raw_series.quantile(0.05)

for caption, value in (
    ("Upper limit: ", upper_limit),
    ("Lower limit: ", lower_limit),
    ("Maximum value: ", raw_series.max()),
    ("Minimum value: ", raw_series.min()),
    ("Median value: ", raw_series.median()),
    ("Standard deviation value: ", raw_series.std()),
):
    print(caption, value)
Upper limit: 65535.0 Lower limit: 219.0 Maximum value: 65535.0 Minimum value: 0.0 Median value: 8192.0 Standard deviation value: 17920.85671285462
# Winsorize Init Fwd Win Bytes: values at or beyond the percentile bounds are
# replaced by the bound itself; everything in between is kept unchanged.
new_df = pd.DataFrame()
feature_name = "Init Fwd Win Bytes"
raw_series = sampled_cic_df[feature_name]

# First raise the low tail to lower_limit, then cap the high tail at upper_limit.
floored_values = np.where(raw_series <= lower_limit, lower_limit, raw_series)
new_df[feature_name] = np.where(raw_series >= upper_limit, upper_limit, floored_values)

# Same summary statistics as for the raw column, now on the winsorized values.
winsorized_series = new_df[feature_name]
for caption, value in (
    ("Maximum value: ", winsorized_series.max()),
    ("Minimum value: ", winsorized_series.min()),
    ("Median value: ", winsorized_series.median()),
    ("Standard deviation value: ", winsorized_series.std()),
    ("Upper limit: ", winsorized_series.quantile(0.95)),
    ("Lower limit: ", winsorized_series.quantile(0.05)),
):
    print(caption, value)
Maximum value: 65535.0 Minimum value: 219.0 Median value: 8192.0 Standard deviation value: 17917.485550925405 Upper limit: 65535.0 Lower limit: 219.0
# Histogram of the winsorized Init Fwd Win Bytes values (log-scaled frequency axis).
feature_name = "Init Fwd Win Bytes"
winsorized_series = new_df[feature_name]

_, after_ax = plt.subplots()
after_ax.hist(winsorized_series, edgecolor="black", log=True)
after_ax.set_xlabel(feature_name)
after_ax.set_ylabel("Frequency")
after_ax.set_title(f"Sampled data, log transformed histogram after winsorization: {feature_name}")
plt.show()

#Plotting both the histograms in parallel for easier comparison
fig1, (ax1, ax2) = plt.subplots(2, 1)
for axis, values, heading in (
    (ax1, sampled_cic_df[feature_name], "Prior winsorization"),
    (ax2, winsorized_series, "After winsorization"),
):
    axis.hist(values, edgecolor="black", log=True)
    axis.set_xlabel(feature_name)
    axis.set_ylabel("Frequency")
    axis.set_title(heading)
plt.tight_layout()
plt.show()
We observed after performing winsorization on sampled data for Init Fwd Win Bytes, the distribution in above histograms looks very similar to original sampled dataset's distribution.
Median value has remained constant=8192.0
Standard deviation value has slightly reduced from 17920 to 17917.
Init Bwd Win Bytes
# Raw distribution and summary statistics for Init Bwd Win Bytes.
feature_name = "Init Bwd Win Bytes"
raw_series = sampled_cic_df[feature_name]

# Histogram of the raw values; log=True puts the frequency axis on a log scale.
_, raw_ax = plt.subplots()
raw_ax.hist(raw_series, edgecolor="black", log=True)
raw_ax.set_xlabel(feature_name)
raw_ax.set_ylabel("Frequency")
raw_ax.set_title(f"Sampled data, log transformed histogram: {feature_name}")
plt.show()

# The 95th / 5th percentiles are the bounds used by the winsorization step.
upper_limit = raw_series.quantile(0.95)
lower_limit = raw_series.quantile(0.05)

for caption, value in (
    ("Upper limit: ", upper_limit),
    ("Lower limit: ", lower_limit),
    ("Maximum value: ", raw_series.max()),
    ("Minimum value: ", raw_series.min()),
    ("Median value: ", raw_series.median()),
    ("Standard deviation value: ", raw_series.std()),
):
    print(caption, value)
Upper limit: 62856.0 Lower limit: 31.0 Maximum value: 65535.0 Minimum value: 0.0 Median value: 235.0 Standard deviation value: 19414.847112640815
# Winsorize Init Bwd Win Bytes: values at or beyond the percentile bounds are
# replaced by the bound itself; everything in between is kept unchanged.
feature_name = "Init Bwd Win Bytes"
raw_series = sampled_cic_df[feature_name]

# First raise the low tail to lower_limit, then cap the high tail at upper_limit.
floored_values = np.where(raw_series <= lower_limit, lower_limit, raw_series)
new_df[feature_name] = np.where(raw_series >= upper_limit, upper_limit, floored_values)

# Same summary statistics as for the raw column, now on the winsorized values.
winsorized_series = new_df[feature_name]
for caption, value in (
    ("Maximum value: ", winsorized_series.max()),
    ("Minimum value: ", winsorized_series.min()),
    ("Median value: ", winsorized_series.median()),
    ("Standard deviation value: ", winsorized_series.std()),
    ("Upper limit: ", winsorized_series.quantile(0.95)),
    ("Lower limit: ", winsorized_series.quantile(0.05)),
):
    print(caption, value)
Maximum value: 62856.0 Minimum value: 31.0 Median value: 235.0 Standard deviation value: 19358.862080156487 Upper limit: 62856.0 Lower limit: 31.0
# Histogram of the winsorized Init Bwd Win Bytes values (log-scaled frequency axis).
feature_name = "Init Bwd Win Bytes"
winsorized_series = new_df[feature_name]

_, after_ax = plt.subplots()
after_ax.hist(winsorized_series, edgecolor="black", log=True)
after_ax.set_xlabel(feature_name)
after_ax.set_ylabel("Frequency")
after_ax.set_title(f"Sampled data, log transformed histogram after winsorization: {feature_name}")
plt.show()

#Plotting both the histograms in parallel for easier comparison
fig1, (ax1, ax2) = plt.subplots(2, 1)
for axis, values, heading in (
    (ax1, sampled_cic_df[feature_name], "Prior winsorization"),
    (ax2, winsorized_series, "After winsorization"),
):
    axis.hist(values, edgecolor="black", log=True)
    axis.set_xlabel(feature_name)
    axis.set_ylabel("Frequency")
    axis.set_title(heading)
plt.tight_layout()
plt.show()
We observed after performing winsorization on sampled data for Init Bwd Win Bytes, the distribution in above histograms looks very similar to original sampled dataset's distribution.
Median value has remained constant=235.0
Standard deviation value has slightly reduced from 19414 to 19358.
Fwd Seg Size Min
# Raw distribution and summary statistics for Fwd Seg Size Min.
feature_name = "Fwd Seg Size Min"
raw_series = sampled_cic_df[feature_name]

# Histogram of the raw values; log=True puts the frequency axis on a log scale.
_, raw_ax = plt.subplots()
raw_ax.hist(raw_series, edgecolor="black", log=True)
raw_ax.set_xlabel(feature_name)
raw_ax.set_ylabel("Frequency")
raw_ax.set_title(f"Sampled data, log transformed histogram: {feature_name}")
plt.show()

# The 95th / 5th percentiles are the bounds used by the winsorization step.
upper_limit = raw_series.quantile(0.95)
lower_limit = raw_series.quantile(0.05)

for caption, value in (
    ("Upper limit: ", upper_limit),
    ("Lower limit: ", lower_limit),
    ("Maximum value: ", raw_series.max()),
    ("Minimum value: ", raw_series.min()),
    ("Median value: ", raw_series.median()),
    ("Standard deviation value: ", raw_series.std()),
):
    print(caption, value)
Upper limit: 32.0 Lower limit: 8.0 Maximum value: 1480.0 Minimum value: 0.0 Median value: 20.0 Standard deviation value: 25.971709364062733
# Winsorize Fwd Seg Size Min: values at or beyond the percentile bounds are
# replaced by the bound itself; everything in between is kept unchanged.
feature_name = "Fwd Seg Size Min"
raw_series = sampled_cic_df[feature_name]

# First raise the low tail to lower_limit, then cap the high tail at upper_limit.
floored_values = np.where(raw_series <= lower_limit, lower_limit, raw_series)
new_df[feature_name] = np.where(raw_series >= upper_limit, upper_limit, floored_values)

# Same summary statistics as for the raw column, now on the winsorized values.
winsorized_series = new_df[feature_name]
for caption, value in (
    ("Maximum value: ", winsorized_series.max()),
    ("Minimum value: ", winsorized_series.min()),
    ("Median value: ", winsorized_series.median()),
    ("Standard deviation value: ", winsorized_series.std()),
    ("Upper limit: ", winsorized_series.quantile(0.95)),
    ("Lower limit: ", winsorized_series.quantile(0.05)),
):
    print(caption, value)
Maximum value: 32.0 Minimum value: 8.0 Median value: 20.0 Standard deviation value: 7.262473158983896 Upper limit: 32.0 Lower limit: 8.0
# Histogram of the winsorized Fwd Seg Size Min values (log-scaled frequency axis).
feature_name = "Fwd Seg Size Min"
winsorized_series = new_df[feature_name]

_, after_ax = plt.subplots()
after_ax.hist(winsorized_series, edgecolor="black", log=True)
after_ax.set_xlabel(feature_name)
after_ax.set_ylabel("Frequency")
after_ax.set_title(f"Sampled data, log transformed histogram after winsorization: {feature_name}")
plt.show()

#Plotting both the histograms in parallel for easier comparison
fig1, (ax1, ax2) = plt.subplots(2, 1)
for axis, values, heading in (
    (ax1, sampled_cic_df[feature_name], "Prior winsorization"),
    (ax2, winsorized_series, "After winsorization"),
):
    axis.hist(values, edgecolor="black", log=True)
    axis.set_xlabel(feature_name)
    axis.set_ylabel("Frequency")
    axis.set_title(heading)
plt.tight_layout()
plt.show()
# Count the records that lie strictly above the winsorization cap. The cap is
# recomputed as the 95th percentile instead of hard-coding 32, so the count
# stays correct if the sample changes.
fwd_seg_cap = sampled_cic_df['Fwd Seg Size Min'].quantile(0.95)
(sampled_cic_df['Fwd Seg Size Min'] > fwd_seg_cap).sum()
17305
We observed after performing winsorization on sampled data for Fwd Seg Size Min, the distribution has changed drastically compared to earlier histogram of sampled dataset's distribution.
Median value has remained constant=20.0
Standard deviation value has reduced from 25.97 to 7.26
Maximum value in the original sampled dataset is 1480 and maximum value after winsorization is 32. Number of values in original sampled dataset between 32 and 1480 = 17305.
Bwd IAT Mean
# Raw distribution and summary statistics for Bwd IAT Mean.
feature_name = "Bwd IAT Mean"
raw_series = sampled_cic_df[feature_name]

# Histogram of the raw values; log=True puts the frequency axis on a log scale.
_, raw_ax = plt.subplots()
raw_ax.hist(raw_series, edgecolor="black", log=True)
raw_ax.set_xlabel(feature_name)
raw_ax.set_ylabel("Frequency")
raw_ax.set_title(f"Sampled data, log transformed histogram: {feature_name}")
plt.show()

# The 95th / 5th percentiles are the bounds used by the winsorization step.
upper_limit = raw_series.quantile(0.95)
lower_limit = raw_series.quantile(0.05)

for caption, value in (
    ("Upper limit: ", upper_limit),
    ("Lower limit: ", lower_limit),
    ("Maximum value: ", raw_series.max()),
    ("Minimum value: ", raw_series.min()),
    ("Median value: ", raw_series.median()),
    ("Standard deviation value: ", raw_series.std()),
):
    print(caption, value)
Upper limit: 6501928.974999998 Lower limit: 0.0 Maximum value: 120000000.0 Minimum value: 0.0 Median value: 647.0 Standard deviation value: 6192044.5
# Winsorize Bwd IAT Mean: values at or beyond the percentile bounds are
# replaced by the bound itself; everything in between is kept unchanged.
feature_name = "Bwd IAT Mean"
raw_series = sampled_cic_df[feature_name]

# First raise the low tail to lower_limit, then cap the high tail at upper_limit.
floored_values = np.where(raw_series <= lower_limit, lower_limit, raw_series)
new_df[feature_name] = np.where(raw_series >= upper_limit, upper_limit, floored_values)

# Same summary statistics as for the raw column, now on the winsorized values.
winsorized_series = new_df[feature_name]
for caption, value in (
    ("Maximum value: ", winsorized_series.max()),
    ("Minimum value: ", winsorized_series.min()),
    ("Median value: ", winsorized_series.median()),
    ("Standard deviation value: ", winsorized_series.std()),
    ("Upper limit: ", winsorized_series.quantile(0.95)),
    ("Lower limit: ", winsorized_series.quantile(0.05)),
):
    print(caption, value)
Maximum value: 6501929.0 Minimum value: 0.0 Median value: 647.0 Standard deviation value: 1657361.2 Upper limit: 6501924.774999999 Lower limit: 0.0
plt.hist(new_df["Bwd IAT Mean"], edgecolor="black", log=True)
plt.xlabel("Bwd IAT Mean")
plt.ylabel("Frequency")
plt.title("Sampled data, log transformed histogram after winsorization: Bwd IAT Mean")
plt.show()
# Stack the "before" and "after" histograms in one figure so the effect of
# winsorization on "Bwd IAT Mean" can be compared directly.
fig1, (ax1, ax2) = plt.subplots(2, 1)
for axis, values, heading in (
    (ax1, sampled_cic_df["Bwd IAT Mean"], "Prior winsorization"),
    (ax2, new_df["Bwd IAT Mean"], "After winsorization"),
):
    axis.hist(values, edgecolor="black", log=True)
    axis.set_xlabel("Bwd IAT Mean")
    axis.set_ylabel("Frequency")
    axis.set_title(heading)
plt.tight_layout()
plt.show()
# Number of sampled rows whose original "Bwd IAT Mean" lies above the
# hard-coded threshold 6501929.
above_cap = sampled_cic_df['Bwd IAT Mean'] > 6501929
sampled_cic_df.loc[above_cap, 'Bwd IAT Mean'].count()
91673
After winsorizing Bwd IAT Mean on the sampled data, we observed that the distribution changed compared to the earlier histogram of the sampled dataset. The main peak in the first bin has remained unchanged. A new peak appears towards the right-hand side of the chart; this is likely because winsorization moved the outlier values into the last bin, which increased its frequency.
Median value has remained constant=647
Standard deviation value has reduced from 6192044.5 to 1657361.2
Maximum value in the original sampled dataset is 120000000 and maximum value after winsorization is 6501929. Number of values in original sampled dataset between 6501929 and 120000000 = 91673.
We will now use RobustScaler to transform the features and compare its results with those of winsorization, to understand how each approach affects the data and handles outliers.
We shall perform Robust Scaling on following 4 features: -
- Init Fwd Win Bytes
- Init Bwd Win Bytes
- Fwd Seg Size Min
- Bwd IAT Mean
# Select the 4 features to be robust-scaled from the sampled dataset.
robust_feature_names = [
    'Init Fwd Win Bytes',
    'Init Bwd Win Bytes',
    'Fwd Seg Size Min',
    'Bwd IAT Mean',
]
robust_scaler_test = sampled_cic_df[robust_feature_names]
robust_scaler_test.head(10)
# reset_index() is not assigned back: it only displays the frame with the
# original index as a column; robust_scaler_test itself is unchanged.
robust_scaler_test.reset_index()
| index | Init Fwd Win Bytes | Init Bwd Win Bytes | Fwd Seg Size Min | Bwd IAT Mean | |
|---|---|---|---|---|---|
| 0 | 5968290 | 219.0 | 211.0 | 32.0 | 27159.000000 |
| 1 | 8285216 | 63326.0 | 235.0 | 20.0 | 0.000000 |
| 2 | 8349977 | 8192.0 | 62856.0 | 20.0 | 297148.500000 |
| 3 | 7180832 | 8192.0 | 235.0 | 0.0 | 0.000000 |
| 4 | 2324438 | 8192.0 | 123.0 | 20.0 | 6344.799805 |
| ... | ... | ... | ... | ... | ... |
| 1833449 | 1606912 | 8192.0 | 31.0 | 20.0 | 15629.142578 |
| 1833450 | 7433839 | 8192.0 | 235.0 | 20.0 | 0.000000 |
| 1833451 | 2510144 | 8192.0 | 16625.0 | 0.0 | 0.000000 |
| 1833452 | 760618 | 279.0 | 235.0 | 20.0 | 0.000000 |
| 1833453 | 7134908 | 8192.0 | 235.0 | 20.0 | 0.000000 |
1833454 rows × 5 columns
from sklearn.preprocessing import RobustScaler
# Fit a RobustScaler with default settings on the 4 selected features and
# transform the same data in a single step.
robust_scaler = RobustScaler()
robust_scaled_data = robust_scaler.fit_transform(robust_scaler_test)
type(robust_scaled_data)
numpy.ndarray
robust_scaled_data
array([[-7.97300000e+03, -6.00000000e-01, 1.20000000e+01,
1.00513250e-01],
[ 5.51340000e+04, 0.00000000e+00, 0.00000000e+00,
-2.45292973e-03],
[ 0.00000000e+00, 1.56552500e+03, 0.00000000e+00,
1.12410718e+00],
...,
[ 0.00000000e+00, 4.09750000e+02, -2.00000000e+01,
-2.45292973e-03],
[-7.91300000e+03, 0.00000000e+00, 0.00000000e+00,
-2.45292973e-03],
[ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
-2.45292973e-03]])
# Wrap the scaled array in a DataFrame that reuses the original feature
# names. No index is passed, so the result gets a fresh default index
# rather than the index of the sampled rows.
columns = robust_scaler_test.columns
robust_scaled_df = pd.DataFrame(robust_scaled_data, columns=columns)
robust_scaled_df.head()
| Init Fwd Win Bytes | Init Bwd Win Bytes | Fwd Seg Size Min | Bwd IAT Mean | |
|---|---|---|---|---|
| 0 | -7973.0 | -0.600 | 12.0 | 0.100513 |
| 1 | 55134.0 | 0.000 | 0.0 | -0.002453 |
| 2 | 0.0 | 1565.525 | 0.0 | 1.124107 |
| 3 | 0.0 | 0.000 | -20.0 | -0.002453 |
| 4 | 0.0 | -2.800 | 0.0 | 0.021602 |
robust_scaled_df.tail()
| Init Fwd Win Bytes | Init Bwd Win Bytes | Fwd Seg Size Min | Bwd IAT Mean | |
|---|---|---|---|---|
| 1833449 | 0.0 | -5.10 | 0.0 | 0.056801 |
| 1833450 | 0.0 | 0.00 | 0.0 | -0.002453 |
| 1833451 | 0.0 | 409.75 | -20.0 | -0.002453 |
| 1833452 | -7913.0 | 0.00 | 0.0 | -0.002453 |
| 1833453 | 0.0 | 0.00 | 0.0 | -0.002453 |
# Before/after histograms for "Init Fwd Win Bytes", followed by summary
# statistics of the robust-scaled column.
fig1, (ax1, ax2) = plt.subplots(2, 1)
for axis, frame, heading in (
    (ax1, sampled_cic_df, "Prior scaling"),
    (ax2, robust_scaled_df, "After scaling"),
):
    axis.hist(frame["Init Fwd Win Bytes"], edgecolor="black", log=True)
    axis.set_xlabel("Init Fwd Win Bytes")
    axis.set_ylabel("Frequency")
    axis.set_title(heading)
plt.tight_layout()
plt.show()
scaled_column = robust_scaled_df["Init Fwd Win Bytes"]
print("Maximum value after scaling: ", scaled_column.max())
print("Minimum value after scaling: ", scaled_column.min())
print("Median value after scaling: ", scaled_column.median())
print("Standard deviation value after scaling: ", scaled_column.std())
Maximum value after scaling: 57343.0 Minimum value after scaling: -8192.0 Median value after scaling: 0.0 Standard deviation value after scaling: 17920.85671285462
# Before/after histograms for "Init Bwd Win Bytes", followed by summary
# statistics of the robust-scaled column.
fig1, (ax1, ax2) = plt.subplots(2, 1)
for axis, frame, heading in (
    (ax1, sampled_cic_df, "Prior scaling"),
    (ax2, robust_scaled_df, "After scaling"),
):
    axis.hist(frame["Init Bwd Win Bytes"], edgecolor="black", log=True)
    axis.set_xlabel("Init Bwd Win Bytes")
    axis.set_ylabel("Frequency")
    axis.set_title(heading)
plt.tight_layout()
plt.show()
scaled_column = robust_scaled_df["Init Bwd Win Bytes"]
print("Maximum value after scaling: ", scaled_column.max())
print("Minimum value after scaling: ", scaled_column.min())
print("Median value after scaling: ", scaled_column.median())
print("Standard deviation value after scaling: ", scaled_column.std())
Maximum value after scaling: 1632.5 Minimum value after scaling: -5.875 Median value after scaling: 0.0 Standard deviation value after scaling: 485.3711778160208
# Before/after histograms for "Fwd Seg Size Min", followed by summary
# statistics of the robust-scaled column.
fig1, (ax1, ax2) = plt.subplots(2, 1)
for axis, frame, heading in (
    (ax1, sampled_cic_df, "Prior scaling"),
    (ax2, robust_scaled_df, "After scaling"),
):
    axis.hist(frame["Fwd Seg Size Min"], edgecolor="black", log=True)
    axis.set_xlabel("Fwd Seg Size Min")
    axis.set_ylabel("Frequency")
    axis.set_title(heading)
plt.tight_layout()
plt.show()
scaled_column = robust_scaled_df["Fwd Seg Size Min"]
print("Maximum value after scaling: ", scaled_column.max())
print("Minimum value after scaling: ", scaled_column.min())
print("Median value after scaling: ", scaled_column.median())
print("Standard deviation value after scaling: ", scaled_column.std())
Maximum value after scaling: 1460.0 Minimum value after scaling: -20.0 Median value after scaling: 0.0 Standard deviation value after scaling: 25.971709364062733
# Before/after histograms for "Bwd IAT Mean", followed by summary
# statistics of the robust-scaled column.
fig1, (ax1, ax2) = plt.subplots(2, 1)
for axis, frame, heading in (
    (ax1, sampled_cic_df, "Prior scaling"),
    (ax2, robust_scaled_df, "After scaling"),
):
    axis.hist(frame["Bwd IAT Mean"], edgecolor="black", log=True)
    axis.set_xlabel("Bwd IAT Mean")
    axis.set_ylabel("Frequency")
    axis.set_title(heading)
plt.tight_layout()
plt.show()
scaled_column = robust_scaled_df["Bwd IAT Mean"]
print("Maximum value after scaling: ", scaled_column.max())
print("Minimum value after scaling: ", scaled_column.min())
print("Median value after scaling: ", scaled_column.median())
print("Standard deviation value after scaling: ", scaled_column.std())
Maximum value after scaling: 454.94587429990975 Minimum value after scaling: -0.002452929730979813 Median value after scaling: 0.0 Standard deviation value after scaling: 23.4755026768653
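With RobustScaler, we observed that the transformed data contains negative values. This is expected: RobustScaler centres each feature by subtracting its median (and divides by its interquartile range), so every value below the median becomes negative.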
However, in our model, we cannot use negative values.
Thus, we shall not employ RobustScaler to scale data for handling outliers in features.
# Build the list of features that get median imputation: every column of
# the sampled dataset except the 4 winsorized features and the two label
# columns. The full column list is printed before the exclusions.
features_to_impute = list(sampled_cic_df.columns)
print(features_to_impute)
for excluded_name in (
    'Init Fwd Win Bytes',
    'Init Bwd Win Bytes',
    'Fwd Seg Size Min',
    'Bwd IAT Mean',
    'ClassLabel',
    'isMalicious',
):
    # list.remove raises ValueError if the column is unexpectedly absent.
    features_to_impute.remove(excluded_name)
['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Fwd Packets Length Total', 'Bwd Packets Length Total', 'Fwd Packet Length Max', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Packet Length Max', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'SYN Flag Count', 'URG Flag Count', 'Avg Packet Size', 'Avg Fwd Segment Size', 'Avg Bwd Segment Size', 'Subflow Fwd Packets', 'Subflow Fwd Bytes', 'Subflow Bwd Packets', 'Subflow Bwd Bytes', 'Init Fwd Win Bytes', 'Init Bwd Win Bytes', 'Fwd Act Data Packets', 'Fwd Seg Size Min', 'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min', 'ClassLabel', 'isMalicious']
# Work on an independent (deep) copy so the imputation experiment does not
# modify sampled_cic_df.
imputed_sample_df = sampled_cic_df.copy(deep=True)
imputed_sample_df.head()
| Flow Duration | Total Fwd Packets | Total Backward Packets | Fwd Packets Length Total | Bwd Packets Length Total | Fwd Packet Length Max | Fwd Packet Length Mean | Fwd Packet Length Std | Bwd Packet Length Max | Bwd Packet Length Mean | ... | Active Mean | Active Std | Active Max | Active Min | Idle Mean | Idle Std | Idle Max | Idle Min | ClassLabel | isMalicious | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5968290 | 3813760.0 | 5 | 3 | 935.0 | 397.0 | 935.0 | 187.000 | 418.144714 | 397.0 | 132.333328 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | 0 |
| 8285216 | 88077383.0 | 2 | 0 | 0.0 | 0.0 | 0.0 | 0.000 | 0.000000 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 88077384.0 | 0.0 | 88077383.0 | 88077383.0 | Benign | 0 |
| 8349977 | 1914354.0 | 8 | 7 | 1144.0 | 1581.0 | 677.0 | 143.000 | 227.969925 | 1173.0 | 225.857147 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | 0 |
| 7180832 | 4002.0 | 6 | 0 | 2064.0 | 0.0 | 440.0 | 344.000 | 148.722565 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | DDoS | 1 |
| 2324438 | 5368715.0 | 8 | 6 | 355.0 | 3292.0 | 198.0 | 44.375 | 75.864426 | 1460.0 | 548.666687 | ... | 102340.0 | 0.0 | 102340.0 | 102340.0 | 5266340.0 | 0.0 | 5266340.0 | 5266340.0 | Benign | 0 |
5 rows × 59 columns
# Replace IQR outliers (outside Q1 - 1.5*IQR .. Q3 + 1.5*IQR) with the
# column median, one feature at a time, on the sampled copy.
# NOTE(review): when a column's IQR is 0 both fences equal Q1, so every
# value different from Q1 is replaced by the median and the column becomes
# constant. Confirm this is intended for sparse features.
for col in features_to_impute:
    series = imputed_sample_df[col]
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    spread = third_quartile - first_quartile
    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread
    fill_value = series.median()
    is_outlier = (series < low_fence) | (series > high_fence)
    imputed_sample_df[col] = np.where(is_outlier, fill_value, series)
imputed_sample_df.head(10)
| Flow Duration | Total Fwd Packets | Total Backward Packets | Fwd Packets Length Total | Bwd Packets Length Total | Fwd Packet Length Max | Fwd Packet Length Mean | Fwd Packet Length Std | Bwd Packet Length Max | Bwd Packet Length Mean | ... | Active Mean | Active Std | Active Max | Active Min | Idle Mean | Idle Std | Idle Max | Idle Min | ClassLabel | isMalicious | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5968290 | 3813760.0 | 5.0 | 3.0 | 935.0 | 397.0 | 935.0 | 187.000 | 418.144714 | 397.0 | 132.333328 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | 0 |
| 8285216 | 397660.5 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000 | 0.000000 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | 0 |
| 8349977 | 1914354.0 | 8.0 | 7.0 | 1144.0 | 1581.0 | 677.0 | 143.000 | 227.969925 | 1173.0 | 225.857147 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | 0 |
| 7180832 | 4002.0 | 6.0 | 0.0 | 2064.0 | 0.0 | 440.0 | 44.000 | 148.722565 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | DDoS | 1 |
| 2324438 | 5368715.0 | 8.0 | 6.0 | 355.0 | 232.0 | 198.0 | 44.375 | 75.864426 | 1460.0 | 108.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | 0 |
| 8791292 | 24810.0 | 1.0 | 1.0 | 35.0 | 51.0 | 35.0 | 35.000 | 0.000000 | 51.0 | 51.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | 0 |
| 2785928 | 84462.0 | 1.0 | 1.0 | 48.0 | 48.0 | 48.0 | 48.000 | 0.000000 | 48.0 | 48.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | 0 |
| 2360459 | 397660.5 | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000 | 0.000000 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | 0 |
| 6152798 | 45697.0 | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000 | 0.000000 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | DoS | 1 |
| 1899583 | 80938.0 | 4.0 | 3.0 | 436.0 | 788.0 | 436.0 | 109.000 | 218.000000 | 788.0 | 262.666656 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Benign | 0 |
10 rows × 59 columns
# Print summary statistics of every imputed feature, separated by a blank
# block, to inspect the effect of the median imputation.
for col in features_to_impute:
    imputed_column = imputed_sample_df[col]
    print("Feature name: ", col)
    print("Mean: ", imputed_column.mean())
    print("Median: ", imputed_column.median())
    print("Maximum: ", imputed_column.max())
    print("Minimum: ", imputed_column.min())
    print("Standard deviation: ", imputed_column.std())
    print("\n")
Feature name: Flow Duration Mean: 1213665.5297534054 Median: 397660.25 Maximum: 13875506.0 Minimum: 1.0 Standard deviation: 2159655.192222251 Feature name: Total Fwd Packets Mean: 3.9682713610486 Median: 3.0 Maximum: 14.0 Minimum: 0.0 Standard deviation: 3.0508858389290645 Feature name: Total Backward Packets Mean: 2.7108163062722053 Median: 2.0 Maximum: 11.0 Minimum: 0.0 Standard deviation: 2.510281941244621 Feature name: Fwd Packets Length Total Mean: 395.65305265362537 Median: 97.0 Maximum: 2292.0 Minimum: 0.0 Standard deviation: 522.316758487006 Feature name: Bwd Packets Length Total Mean: 351.8448387578854 Median: 232.0 Maximum: 2410.0 Minimum: 0.0 Standard deviation: 484.0204739171949 Feature name: Fwd Packet Length Max Mean: 259.6825521665665 Median: 55.0 Maximum: 1235.0 Minimum: 0.0 Standard deviation: 319.60892275413545 Feature name: Fwd Packet Length Mean Mean: 59.869995 Median: 44.0 Maximum: 258.66666 Minimum: 0.0 Standard deviation: 58.978123 Feature name: Fwd Packet Length Std Mean: 92.038086 Median: 11.547006 Maximum: 451.74237 Minimum: 0.0 Standard deviation: 130.14272 Feature name: Bwd Packet Length Max Mean: 403.9789599302737 Median: 152.0 Maximum: 2410.0 Minimum: 0.0 Standard deviation: 508.52023457970176 Feature name: Bwd Packet Length Mean Mean: 108.74675 Median: 108.0 Maximum: 540.9375 Minimum: 0.0 Standard deviation: 103.59248 Feature name: Bwd Packet Length Std Mean: 163.04944 Median: 0.0 Maximum: 1013.63153 Minimum: 0.0 Standard deviation: 222.16183 Feature name: Flow Bytes/s Mean: 3909.036427263739 Median: 996.359843875 Maximum: 67824.6484698098 Minimum: 0.0 Standard deviation: 9988.548556992235 Feature name: Flow Packets/s Mean: 66.50042504069462 Median: 16.13210581515 Maximum: 1242.015625 Minimum: 0.016667174321018 Standard deviation: 172.02090506062973 Feature name: Flow IAT Mean Mean: 176059.79187172325 Median: 82687.748046875 Maximum: 1968061.25 Minimum: 0.3333333432674408 Standard deviation: 326493.8396342 Feature name: Flow IAT Std 
Mean: 210712.12 Median: 18652.17 Maximum: 2091517.5 Minimum: 0.0 Standard deviation: 469940.1 Feature name: Flow IAT Max Mean: 1481385.0418025213 Median: 225017.25 Maximum: 12737357.0 Minimum: 1.0 Standard deviation: 2794362.274549001 Feature name: Flow IAT Min Mean: 50.33824410102462 Median: 14.0 Maximum: 1170.0 Minimum: 0.0 Standard deviation: 149.01435145238835 Feature name: Fwd IAT Total Mean: 892652.4578448655 Median: 72909.25 Maximum: 11725868.0 Minimum: 0.0 Standard deviation: 1875030.9235689652 Feature name: Fwd IAT Mean Mean: 182948.53978239992 Median: 28688.02490234375 Maximum: 2684606.0 Minimum: 0.0 Standard deviation: 410226.2879792003 Feature name: Fwd IAT Std Mean: 46698.184 Median: 454.50018 Maximum: 999041.94 Minimum: 0.0 Standard deviation: 130210.86 Feature name: Fwd IAT Max Mean: 1212594.4175294281 Median: 61635.0 Maximum: 10556757.0 Minimum: 0.0 Standard deviation: 2622169.341081965 Feature name: Fwd IAT Min Mean: 65.34727787007473 Median: 36.0 Maximum: 1144.0 Minimum: 0.0 Standard deviation: 131.08000116496532 Feature name: Bwd IAT Total Mean: 203173.0466682011 Median: 732.0 Maximum: 3138539.0 Minimum: 0.0 Standard deviation: 534712.8396043193 Feature name: Bwd IAT Std Mean: 45497.36 Median: 0.0 Maximum: 705338.44 Minimum: 0.0 Standard deviation: 123579.555 Feature name: Bwd IAT Max Mean: 162097.3564518117 Median: 709.0 Maximum: 2382694.0 Minimum: 0.0 Standard deviation: 390905.02938092116 Feature name: Bwd IAT Min Mean: 36.23238488666746 Median: 3.0 Maximum: 767.0 Minimum: 0.0 Standard deviation: 113.35976821326203 Feature name: Fwd PSH Flags Mean: 0.0 Median: 0.0 Maximum: 0.0 Minimum: 0.0 Standard deviation: 0.0 Feature name: Fwd Header Length Mean: 94.90806205118864 Median: 72.0 Maximum: 360.0 Minimum: 0.0 Standard deviation: 79.50237485349817 Feature name: Bwd Header Length Mean: 68.0338039569032 Median: 60.0 Maximum: 328.0 Minimum: 0.0 Standard deviation: 67.83078348929288 Feature name: Fwd Packets/s Mean: 35.18084 Median: 8.57913 Maximum: 
671.32117 Minimum: 0.0 Standard deviation: 90.893295 Feature name: Bwd Packets/s Mean: 11.960576 Median: 3.3275118 Maximum: 192.61328 Minimum: 0.0 Standard deviation: 25.856539 Feature name: Packet Length Max Mean: 488.07187526930045 Median: 232.0 Maximum: 2341.0 Minimum: 0.0 Standard deviation: 519.0516918125927 Feature name: Packet Length Mean Mean: 83.624954 Median: 78.666664 Maximum: 342.30768 Minimum: 0.0 Standard deviation: 68.9974 Feature name: Packet Length Std Mean: 155.90248 Median: 73.90083 Maximum: 785.4239 Minimum: 0.0 Standard deviation: 176.20068 Feature name: Packet Length Variance Mean: 36721.277 Median: 5461.3335 Maximum: 254975.14 Minimum: 0.0 Standard deviation: 54679.88 Feature name: SYN Flag Count Mean: 0.0 Median: 0.0 Maximum: 0.0 Minimum: 0.0 Standard deviation: 0.0 Feature name: URG Flag Count Mean: 0.0 Median: 0.0 Maximum: 0.0 Minimum: 0.0 Standard deviation: 0.0 Feature name: Avg Packet Size Mean: 97.50851 Median: 99.5 Maximum: 372.66666 Minimum: 0.0 Standard deviation: 75.065735 Feature name: Avg Fwd Segment Size Mean: 59.869995 Median: 44.0 Maximum: 258.66666 Minimum: 0.0 Standard deviation: 58.978123 Feature name: Avg Bwd Segment Size Mean: 108.74675 Median: 108.0 Maximum: 540.9375 Minimum: 0.0 Standard deviation: 103.59248 Feature name: Subflow Fwd Packets Mean: 3.9682713610486 Median: 3.0 Maximum: 14.0 Minimum: 0.0 Standard deviation: 3.0508858389290645 Feature name: Subflow Fwd Bytes Mean: 395.65305265362537 Median: 97.0 Maximum: 2292.0 Minimum: 0.0 Standard deviation: 522.316758487006 Feature name: Subflow Bwd Packets Mean: 2.7108163062722053 Median: 2.0 Maximum: 11.0 Minimum: 0.0 Standard deviation: 2.510281941244621 Feature name: Subflow Bwd Bytes Mean: 351.8448387578854 Median: 232.0 Maximum: 2410.0 Minimum: 0.0 Standard deviation: 484.0204739171949 Feature name: Fwd Act Data Packets Mean: 1.7136301210720313 Median: 1.0 Maximum: 10.0 Minimum: 0.0 Standard deviation: 2.1627014579408916 Feature name: Active Mean Mean: 0.0 Median: 
0.0 Maximum: 0.0 Minimum: 0.0 Standard deviation: 0.0 Feature name: Active Std Mean: 0.0 Median: 0.0 Maximum: 0.0 Minimum: 0.0 Standard deviation: 0.0 Feature name: Active Max Mean: 0.0 Median: 0.0 Maximum: 0.0 Minimum: 0.0 Standard deviation: 0.0 Feature name: Active Min Mean: 0.0 Median: 0.0 Maximum: 0.0 Minimum: 0.0 Standard deviation: 0.0 Feature name: Idle Mean Mean: 0.0 Median: 0.0 Maximum: 0.0 Minimum: 0.0 Standard deviation: 0.0 Feature name: Idle Std Mean: 0.0 Median: 0.0 Maximum: 0.0 Minimum: 0.0 Standard deviation: 0.0 Feature name: Idle Max Mean: 0.0 Median: 0.0 Maximum: 0.0 Minimum: 0.0 Standard deviation: 0.0 Feature name: Idle Min Mean: 0.0 Median: 0.0 Maximum: 0.0 Minimum: 0.0 Standard deviation: 0.0
We shall perform handling of outliers on the main dataset after testing the approaches on different features of sample dataset.
For following 4 features, we shall perform Winsorization: -
Init Fwd Win Bytes
Init Bwd Win Bytes
Fwd Seg Size Min
Bwd IAT Mean
- These 4 features have higher number of outliers, most of them being labelled as Benign.
- Thus, by winsorization, we try to reduce the influence of outliers by handling extreme values.
- Outliers also may indicate noisy data.
- Since the above 4 features have large number of outliers, there is higher likelihood for occurrence of noisy data in those features.
- Thus, winsorization will enable us to reduce the noise in the above features.
For remaining features, we shall perform imputation with median value.
- Since the number of outliers in these features is very small, imputing them with the median value will help to approximate the entries having outliers.
- Most of our features are skewed; as a result, we impute outliers with the respective median values rather than the mean.
This process will help us ensure that all outliers are handled, and we do not delete any rows causing loss of data.
# Imputation of outliers with the median on the main dataset: values
# outside Q1 - 1.5*IQR .. Q3 + 1.5*IQR are replaced by the column median.
# NOTE(review): when a column's IQR is 0 both fences equal Q1, so every
# value different from Q1 is replaced by the median and the column becomes
# constant. Confirm this is intended for sparse features.
for col in features_to_impute:
    series = cic_df[col]
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    spread = third_quartile - first_quartile
    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread
    fill_value = series.median()
    is_outlier = (series < low_fence) | (series > high_fence)
    cic_df[col] = np.where(is_outlier, fill_value, series)
# Winsorization of outliers on the main dataset: each listed feature is
# capped at its own 5th and 95th percentiles.
features_to_cap = [
    'Init Fwd Win Bytes',
    'Init Bwd Win Bytes',
    'Fwd Seg Size Min',
    'Bwd IAT Mean',
]
for col in features_to_cap:
    values = cic_df[col]
    upper_limit = values.quantile(0.95)
    lower_limit = values.quantile(0.05)
    # Apply the floor first, then let the cap take precedence, matching
    # the nested where(>= upper, upper, where(<= lower, lower, x)) form.
    floored_values = np.where(values <= lower_limit, lower_limit, values)
    cic_df[col] = np.where(values >= upper_limit, upper_limit, floored_values)
# Discard the old sample and draw a fresh 20% sample (without replacement,
# fixed seed) from the outlier-handled main dataset.
del sampled_cic_df
sample_size = int(0.2 * cic_df.shape[0])
sampled_cic_df = cic_df.sample(n=sample_size, replace=False, random_state=42)
sampled_cic_df.shape
(1833454, 59)
# Plot one log-scaled histogram per feature of the sampled dataset.
# The label columns 'isMalicious' and 'ClassLabel' are not plotted.
label_columns = ['isMalicious', 'ClassLabel']
columns = [name for name in cic_df.columns if name not in label_columns]
# One subplot row per feature, each 5 inches tall.
fig, axes = plt.subplots(nrows=len(columns), ncols=1, figsize=(10, len(columns) * 5))
for axis, column in zip(axes, columns):
    sampled_cic_df[column].hist(bins=50, ax=axis, log=True)
    axis.set_xlabel("Values")
    axis.set_ylabel("Frequency w.r.t log scale")
    axis.set_title(column)
plt.tight_layout()
plt.show()
- In the above histograms, we observed there are some features with single value.
- Such features will not help us train the classifier because irrespective of type of event, those feature values will remain unchanged.
# Columns of the sampled dataset that hold exactly one distinct value.
distinct_counts = sampled_cic_df.nunique()
unique_columns = distinct_counts.index[distinct_counts == 1].tolist()
print("Features with single value: ", unique_columns)
Features with single value: ['Fwd PSH Flags', 'SYN Flag Count', 'URG Flag Count', 'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min']
# Repeat the single-value check on the main dataset to confirm that the
# list matches the one found on the sample.
distinct_counts = cic_df.nunique()
unique_columns = distinct_counts.index[distinct_counts == 1].tolist()
print("Features with single value: ", unique_columns)
Features with single value: ['Fwd PSH Flags', 'SYN Flag Count', 'URG Flag Count', 'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min']
# Drop the single-valued features from both datasets: a constant column
# carries no information for training the model.
sampled_cic_df = sampled_cic_df.drop(columns=unique_columns)
cic_df = cic_df.drop(columns=unique_columns)
print("Shape of main dataset: ", cic_df.shape)
print("Shape of sampled dataset: ", sampled_cic_df.shape)
Shape of main dataset: (9167271, 48) Shape of sampled dataset: (1833454, 48)
sampled_cic_df.head()
| Flow Duration | Total Fwd Packets | Total Backward Packets | Fwd Packets Length Total | Bwd Packets Length Total | Fwd Packet Length Max | Fwd Packet Length Mean | Fwd Packet Length Std | Bwd Packet Length Max | Bwd Packet Length Mean | ... | Subflow Fwd Packets | Subflow Fwd Bytes | Subflow Bwd Packets | Subflow Bwd Bytes | Init Fwd Win Bytes | Init Bwd Win Bytes | Fwd Act Data Packets | Fwd Seg Size Min | ClassLabel | isMalicious | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5968290 | 3813760.0 | 5.0 | 3.0 | 935.0 | 397.0 | 935.0 | 187.000 | 418.144714 | 397.0 | 132.333328 | ... | 5.0 | 935.0 | 3.0 | 397.0 | 219.0 | 211.0 | 1.0 | 32.0 | Benign | 0 |
| 8285216 | 396839.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000 | 0.000000 | 0.0 | 0.000000 | ... | 2.0 | 0.0 | 0.0 | 0.0 | 63326.0 | 235.0 | 0.0 | 20.0 | Benign | 0 |
| 8349977 | 1914354.0 | 8.0 | 7.0 | 1144.0 | 1581.0 | 677.0 | 143.000 | 227.969925 | 1173.0 | 225.857147 | ... | 8.0 | 1144.0 | 7.0 | 1581.0 | 8192.0 | 62856.0 | 5.0 | 20.0 | Benign | 0 |
| 7180832 | 4002.0 | 6.0 | 0.0 | 2064.0 | 0.0 | 440.0 | 44.000 | 148.722565 | 0.0 | 0.000000 | ... | 6.0 | 2064.0 | 0.0 | 0.0 | 8192.0 | 235.0 | 5.0 | 8.0 | DDoS | 1 |
| 2324438 | 5368715.0 | 8.0 | 6.0 | 355.0 | 232.0 | 198.0 | 44.375 | 75.864426 | 1460.0 | 108.000000 | ... | 8.0 | 355.0 | 6.0 | 232.0 | 8192.0 | 123.0 | 3.0 | 20.0 | Benign | 0 |
5 rows × 48 columns
sampled_cic_df.tail()
| Flow Duration | Total Fwd Packets | Total Backward Packets | Fwd Packets Length Total | Bwd Packets Length Total | Fwd Packet Length Max | Fwd Packet Length Mean | Fwd Packet Length Std | Bwd Packet Length Max | Bwd Packet Length Mean | ... | Subflow Fwd Packets | Subflow Fwd Bytes | Subflow Bwd Packets | Subflow Bwd Bytes | Init Fwd Win Bytes | Init Bwd Win Bytes | Fwd Act Data Packets | Fwd Seg Size Min | ClassLabel | isMalicious | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1606912 | 189583.0 | 10.0 | 8.0 | 496.0 | 232.0 | 192.0 | 49.599998 | 77.654793 | 1460.0 | 108.0 | ... | 10.0 | 496.0 | 8.0 | 232.0 | 8192.0 | 31.0 | 4.0 | 20.0 | Benign | 0 |
| 7433839 | 3000787.0 | 4.0 | 0.0 | 2064.0 | 0.0 | 516.0 | 44.000000 | 0.000000 | 0.0 | 0.0 | ... | 4.0 | 2064.0 | 0.0 | 0.0 | 8192.0 | 235.0 | 3.0 | 20.0 | DDoS | 1 |
| 2510144 | 40.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 1.0 | 0.0 | 8192.0 | 16625.0 | 0.0 | 8.0 | Benign | 0 |
| 760618 | 396839.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | ... | 2.0 | 0.0 | 0.0 | 0.0 | 279.0 | 235.0 | 0.0 | 20.0 | Benign | 0 |
| 7134908 | 4097.0 | 3.0 | 0.0 | 97.0 | 0.0 | 440.0 | 44.000000 | 27.430199 | 0.0 | 0.0 | ... | 3.0 | 97.0 | 0.0 | 0.0 | 8192.0 | 235.0 | 1.0 | 20.0 | DDoS | 1 |
5 rows × 48 columns
import math
# Work on a separate copy of the sampled dataset for the pyramid charts, because
# the plotting loop overwrites feature columns with their binned categories and
# sampled_cic_df must stay unchanged for the later analysis
pyramid_sampled_df=sampled_cic_df.copy()
pyramid_sampled_df.shape
(1833454, 48)
# The pyramid charts group records by isMalicious only, so the textual
# ClassLabel column is removed from the plotting copy
pyramid_sampled_df=pyramid_sampled_df.drop('ClassLabel',axis=1)
pyramid_sampled_df.shape
(1833454, 47)
# Number of records in the sample; used as n in the bin-width formula of the plotting loop
n=pyramid_sampled_df.shape[0]
print("n: ",n)
n: 1833454
# Build one "pyramid" chart per feature: the feature is binned with the
# Freedman-Diaconis rule and, for every bin, the share of benign records is
# drawn upwards and the share of malicious records downwards.
column_list=pyramid_sampled_df.columns.tolist()
# The cube root of the sample size is identical for every feature, so compute it once
cube_root_n=n**(1./3)
for col in column_list:
    # isMalicious is the grouping key of every chart, not a feature to bin
    if col == 'isMalicious':
        continue
    p25, p75 = np.percentile(pyramid_sampled_df[col], [25, 75])
    print("Feature name: ",col)
    print("p25: ",p25)
    print("p75: ",p75)
    # Freedman-Diaconis bin width: 2 * IQR / n^(1/3)
    width=2.*(p75-p25)/cube_root_n
    # A zero (or undefined) IQR yields no usable bin width. Skip the feature
    # explicitly instead of letting np.arange divide by zero and fail silently.
    if not np.isfinite(width) or width <= 0:
        print("Skipping feature, IQR gives no usable bin width: ",col)
        continue
    try:
        # Creating bins using the calculated width
        bin_edges=np.arange(pyramid_sampled_df[col].min(), pyramid_sampled_df[col].max() + width, width)
        # include_lowest=True keeps the records equal to the column minimum;
        # with the default they sit on the open left edge of the first bin
        # and are silently turned into NaN (i.e. dropped from the chart)
        pyramid_sampled_df[col] = pd.cut(pyramid_sampled_df[col], bins=bin_edges, include_lowest=True)
    except ValueError as err:
        # Report features that cannot be binned rather than hiding the failure
        print("Skipping feature, binning failed: ",col,"-",err)
        continue
    # Grouping data by bins and isMalicious (target feature); observed=False is
    # passed explicitly to keep empty bins and avoid the pandas FutureWarning
    grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious'], observed=False).size().unstack()
    # Calculate normalized frequencies for each bin and class
    normalized_data=grouped_data.div(grouped_data.sum(axis=1), axis=0)
    bin_labels=normalized_data.index.astype(str)
    # Create the pyramid chart using matplotlib
    fig, ax = plt.subplots()
    # Plot the bars for each class in opposite directions
    ax.bar(bin_labels, normalized_data[0], width=0.8, align='center', color='lightblue', label="Benign")
    ax.bar(bin_labels, -normalized_data[1], width=0.8, align='center', color='lightcoral', label="Malicious")
    # Customize the plot
    ax.set_xlabel(col)
    ax.set_ylabel('Normalized Frequency')
    ax.set_title(col)
    ax.legend()
    # Show the plot
    plt.show()
Feature name: Flow Duration p25: 11621.0 p75: 1273497.25
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Total Fwd Packets p25: 2.0 p75: 5.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Total Backward Packets p25: 1.0 p75: 4.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Fwd Packets Length Total p25: 30.0 p75: 858.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Bwd Packets Length Total p25: 0.0 p75: 338.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Fwd Packet Length Max p25: 20.0 p75: 440.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Fwd Packet Length Mean p25: 7.0 p75: 91.2727279663086
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Fwd Packet Length Std p25: 0.0 p75: 168.00892639160156
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Bwd Packet Length Max p25: 0.0 p75: 859.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Bwd Packet Length Mean p25: 0.0 p75: 161.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Bwd Packet Length Std p25: 0.0 p75: 284.55936431884766
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Flow Bytes/s p25: 55.825086049175 p75: 1456.5540008269675
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Flow Packets/s p25: 1.46565167235 p75: 32.86284691
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Flow IAT Mean p25: 2578.0 p75: 148040.30859375
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Flow IAT Std p25: 0.0 p75: 112470.814453125
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Flow IAT Max p25: 10618.0 p75: 984370.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Flow IAT Min p25: 3.0 p75: 21.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Fwd IAT Total p25: 285.0 p75: 323869.75
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Fwd IAT Mean p25: 136.0 p75: 81006.75
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Fwd IAT Std p25: 0.0 p75: 2294.5615234375
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack() C:\Users\pc\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\pylabtools.py:152: UserWarning: Creating legend with loc="best" can be slow with large amounts of data. fig.canvas.print_figure(bytes_io, **kw)
Feature name: Fwd IAT Max p25: 206.0 p75: 997940.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Fwd IAT Min p25: 2.0 p75: 46.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Bwd IAT Total p25: 0.0 p75: 25579.5
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Bwd IAT Mean p25: 0.0 p75: 263766.21875
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Bwd IAT Std p25: 0.0 p75: 4926.3211669921875
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Bwd IAT Max p25: 0.0 p75: 36136.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Bwd IAT Min p25: 0.0 p75: 4.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Fwd Header Length p25: 40.0 p75: 136.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Bwd Header Length p25: 8.0 p75: 104.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Fwd Packets/s p25: 0.878700390458107 p75: 17.483688831329346
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Bwd Packets/s p25: 0.14235177636146545 p75: 4.386377453804016
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Packet Length Max p25: 46.0 p75: 935.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Packet Length Mean p25: 30.75 p75: 137.82608032226562
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Packet Length Std p25: 8.763561248779297 p75: 317.73126220703125
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Packet Length Variance p25: 76.80000305175781 p75: 89114.359375
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Avg Packet Size p25: 41.0 p75: 151.09524536132812
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Avg Fwd Segment Size p25: 7.0 p75: 91.2727279663086
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Avg Bwd Segment Size p25: 0.0 p75: 161.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Subflow Fwd Packets p25: 2.0 p75: 5.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Subflow Fwd Bytes p25: 30.0 p75: 858.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Subflow Bwd Packets p25: 1.0 p75: 4.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Subflow Bwd Bytes p25: 0.0 p75: 338.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Init Fwd Win Bytes p25: 8192.0 p75: 8192.0 Feature name: Init Bwd Win Bytes p25: 219.0 p75: 259.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:9: RuntimeWarning: divide by zero encountered in double_scalars pyramid_sampled_df[col] = pd.cut(pyramid_sampled_df[col], bins=np.arange(pyramid_sampled_df[col].min(), pyramid_sampled_df[col].max() + width, width)) C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack() C:\Users\pc\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\pylabtools.py:152: UserWarning: Creating legend with loc="best" can be slow with large amounts of data. fig.canvas.print_figure(bytes_io, **kw)
Feature name: Fwd Act Data Packets p25: 0.0 p75: 2.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data=pyramid_sampled_df.groupby([col, 'isMalicious']).size().unstack()
Feature name: Fwd Seg Size Min p25: 20.0 p75: 20.0 Feature name: isMalicious p25: 0.0 p75: 0.0
C:\Users\pc\AppData\Local\Temp\ipykernel_3560\4091346683.py:9: RuntimeWarning: divide by zero encountered in double_scalars pyramid_sampled_df[col] = pd.cut(pyramid_sampled_df[col], bins=np.arange(pyramid_sampled_df[col].min(), pyramid_sampled_df[col].max() + width, width))
We selected the Freedman-Diaconis rule to compute the bin width, and hence the number of bins, for each feature when plotting the charts.
This rule computes the bin width from each feature's IQR as width = 2 × IQR / n^(1/3), where n is the number of records. It therefore reduces the impact of skewness in the data and does not assume that the feature is normally distributed. Since it uses the IQR, it effectively handles potential outliers in the data.
The following features have an almost equal number of Malicious and Benign records in most of the bins: -
- Flow Duration
- Flow IAT Max
- Fwd Header Length
The following features have some bins where the number of Malicious records is relatively higher than the number of Benign records, and we observed a change in pattern: -
- Flow Bytes/s
- Flow Packets/s
- Flow IAT Std
- Fwd IAT Max
- Bwd IAT Std
- Bwd IAT Max
- Fwd Packets/s
- Bwd Packets/s
"Init Bwd Win Bytes" was a rare feature which had only 1 bin with Malicious records; all the other bins had Benign records.
All remaining features have a much higher number of Benign records than Malicious records in most of the bins.
While carrying out the interpretation, small variations and changes were not recorded, as decisions based on minor interpretations may result in incorrect analysis. Only the patterns which are thick and broadly visible were recorded from the above Pyramid charts plotted with respect to the binary target feature: isMalicious.
# Map every distinct ClassLabel value to an integer code, giving each attack
# type a unique numerical identifier stored in the new column attack_id
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
encoded_class_labels = le.fit_transform(sampled_cic_df["ClassLabel"])
sampled_cic_df["attack_id"] = encoded_class_labels
# List the distinct codes in order of first appearance
print("Attack id: ", sampled_cic_df["attack_id"].unique())
Attack id: [0 3 4 5 1 2 6 7]
# Preview the first five rows, now including the attack_id column
sampled_cic_df.head()
| Flow Duration | Total Fwd Packets | Total Backward Packets | Fwd Packets Length Total | Bwd Packets Length Total | Fwd Packet Length Max | Fwd Packet Length Mean | Fwd Packet Length Std | Bwd Packet Length Max | Bwd Packet Length Mean | ... | Subflow Fwd Bytes | Subflow Bwd Packets | Subflow Bwd Bytes | Init Fwd Win Bytes | Init Bwd Win Bytes | Fwd Act Data Packets | Fwd Seg Size Min | ClassLabel | isMalicious | attack_id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5968290 | 3813760.0 | 5.0 | 3.0 | 935.0 | 397.0 | 935.0 | 187.000 | 418.144714 | 397.0 | 132.333328 | ... | 935.0 | 3.0 | 397.0 | 219.0 | 211.0 | 1.0 | 32.0 | Benign | 0 | 0 |
| 8285216 | 396839.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000 | 0.000000 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 63326.0 | 235.0 | 0.0 | 20.0 | Benign | 0 | 0 |
| 8349977 | 1914354.0 | 8.0 | 7.0 | 1144.0 | 1581.0 | 677.0 | 143.000 | 227.969925 | 1173.0 | 225.857147 | ... | 1144.0 | 7.0 | 1581.0 | 8192.0 | 62856.0 | 5.0 | 20.0 | Benign | 0 | 0 |
| 7180832 | 4002.0 | 6.0 | 0.0 | 2064.0 | 0.0 | 440.0 | 44.000 | 148.722565 | 0.0 | 0.000000 | ... | 2064.0 | 0.0 | 0.0 | 8192.0 | 235.0 | 5.0 | 8.0 | DDoS | 1 | 3 |
| 2324438 | 5368715.0 | 8.0 | 6.0 | 355.0 | 232.0 | 198.0 | 44.375 | 75.864426 | 1460.0 | 108.000000 | ... | 355.0 | 6.0 | 232.0 | 8192.0 | 123.0 | 3.0 | 20.0 | Benign | 0 | 0 |
5 rows × 49 columns
# Pair every class name known to the fitted encoder with its integer code
class_names = le.classes_
class_codes = le.transform(class_names)
label_mapping = {name: code for name, code in zip(class_names, class_codes)}
print("Attack id of each distinct value in field ClassLabel:", label_mapping)
Attack id of each distinct value in field ClassLabel: {'Benign': 0, 'Botnet': 1, 'Bruteforce': 2, 'DDoS': 3, 'DoS': 4, 'Infiltration': 5, 'Portscan': 6, 'Webattack': 7}
# Draw a reproducible (random_state=42) sub-sample of 20% of the sampled
# dataset, without replacement, to use for the correlation analysis
corr_sample_size=int(0.2*len(sampled_cic_df))
corr_df=sampled_cic_df.sample(n=corr_sample_size, replace=False, random_state=42)
type(corr_df)
pandas.core.frame.DataFrame
# Remove the text label (ClassLabel) and the binary flag (isMalicious) so that
# attack_id is the only target column left next to the features
columns_to_remove = ['isMalicious', 'ClassLabel']
corr_df = corr_df.drop(columns=columns_to_remove)
corr_df.dtypes
Flow Duration float64 Total Fwd Packets float64 Total Backward Packets float64 Fwd Packets Length Total float64 Bwd Packets Length Total float64 Fwd Packet Length Max float64 Fwd Packet Length Mean float32 Fwd Packet Length Std float32 Bwd Packet Length Max float64 Bwd Packet Length Mean float32 Bwd Packet Length Std float32 Flow Bytes/s float64 Flow Packets/s float64 Flow IAT Mean float64 Flow IAT Std float32 Flow IAT Max float64 Flow IAT Min float64 Fwd IAT Total float64 Fwd IAT Mean float64 Fwd IAT Std float32 Fwd IAT Max float64 Fwd IAT Min float64 Bwd IAT Total float64 Bwd IAT Mean float32 Bwd IAT Std float32 Bwd IAT Max float64 Bwd IAT Min float64 Fwd Header Length float64 Bwd Header Length float64 Fwd Packets/s float32 Bwd Packets/s float32 Packet Length Max float64 Packet Length Mean float32 Packet Length Std float32 Packet Length Variance float32 Avg Packet Size float32 Avg Fwd Segment Size float32 Avg Bwd Segment Size float32 Subflow Fwd Packets float64 Subflow Fwd Bytes float64 Subflow Bwd Packets float64 Subflow Bwd Bytes float64 Init Fwd Win Bytes float64 Init Bwd Win Bytes float64 Fwd Act Data Packets float64 Fwd Seg Size Min float64 attack_id int32 dtype: object
# Row and column count of the correlation sub-sample
corr_df.shape
(366690, 47)
# Preview the first five rows of the correlation sub-sample
corr_df.head()
| Flow Duration | Total Fwd Packets | Total Backward Packets | Fwd Packets Length Total | Bwd Packets Length Total | Fwd Packet Length Max | Fwd Packet Length Mean | Fwd Packet Length Std | Bwd Packet Length Max | Bwd Packet Length Mean | ... | Avg Bwd Segment Size | Subflow Fwd Packets | Subflow Fwd Bytes | Subflow Bwd Packets | Subflow Bwd Bytes | Init Fwd Win Bytes | Init Bwd Win Bytes | Fwd Act Data Packets | Fwd Seg Size Min | attack_id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3357734 | 13523961.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 2049.0 | 235.0 | 0.0 | 20.0 | 3 |
| 7460867 | 20572.0 | 2.0 | 2.0 | 90.0 | 172.0 | 45.0 | 45.000000 | 0.000000 | 86.0 | 86.0 | ... | 86.0 | 2.0 | 90.0 | 2.0 | 172.0 | 8192.0 | 235.0 | 1.0 | 20.0 | 0 |
| 2952728 | 1374475.0 | 3.0 | 4.0 | 20.0 | 964.0 | 20.0 | 6.666667 | 11.547006 | 964.0 | 241.0 | ... | 241.0 | 3.0 | 20.0 | 4.0 | 964.0 | 8192.0 | 211.0 | 1.0 | 20.0 | 3 |
| 7243433 | 84231.0 | 4.0 | 2.0 | 168.0 | 414.0 | 42.0 | 42.000000 | 0.000000 | 207.0 | 207.0 | ... | 207.0 | 4.0 | 168.0 | 2.0 | 414.0 | 8192.0 | 235.0 | 3.0 | 20.0 | 0 |
| 4133394 | 1378.0 | 5.0 | 2.0 | 935.0 | 309.0 | 935.0 | 187.000000 | 418.144714 | 309.0 | 154.5 | ... | 154.5 | 5.0 | 935.0 | 2.0 | 309.0 | 65535.0 | 32768.0 | 1.0 | 20.0 | 0 |
5 rows × 47 columns
# Pairwise correlation between all columns of the sub-sample (features and attack_id)
corr_matrix = corr_df.corr()
feature_names = corr_df.columns
tick_positions = np.arange(len(feature_names))

fig, ax = plt.subplots(figsize=(40, 40))
# Draw the matrix as a colour-coded grid and attach the matching colour scale
cax = ax.matshow(corr_matrix, cmap="coolwarm")
fig.colorbar(cax)
# One tick per column on both axes, labelled with the column name
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(feature_names, rotation=90)
ax.set_yticklabels(feature_names)
# Write every coefficient, rounded to two decimals, in the centre of its cell
for (row, column), coefficient in np.ndenumerate(corr_matrix.to_numpy()):
    ax.text(column, row, round(coefficient, 2),
            ha="center", va="center", color="black", fontsize=10)
plt.title("Correlation Matrix Heatmap w.r.t attack_id", fontsize=50)
plt.show()
If we take the original sampled dataset, sampled_cic_df, which was compiled by taking 20% of the records from the original dataset, and try to plot the correlation matrix as above, we get an insufficient-memory error.
As a result, we took another sample from the sampled dataset, corr_df, by taking 20% of its records, and computed the correlation among the features and with the target feature, attack_id.
From the above results, we observed that all features have blue squares against the target, which means that all features have a weak relationship with the target feature.
However, many pairs of independent features have red and dark red squares, which indicates a strong relationship among some of the independent features of the sub-sampled dataset.
Some of the examples are: -
- Fwd Packet Length Max - Fwd Packet Length Mean = 0.88
- Fwd Packet Length Max - Fwd Packet Length Std = 0.91
- Bwd Packet Length Std - Packet Length Max = 0.91
- Bwd Packet Length Std - Packet Length Mean = 0.71
# Standard deviation of every numeric column of the sampled dataset
stdev = sampled_cic_df.std(numeric_only=True)
# Keep the names of the columns whose standard deviation is exactly zero
zero_std_cols = [feature for feature, deviation in stdev.items() if deviation == 0]
print("Features with zero standard deviation in sample dataset: ",zero_std_cols)
Features with zero standard deviation in sample dataset: []
# Print the correlation matrix as text; pandas wraps the wide matrix into chunks of columns
print(corr_matrix)
Flow Duration Total Fwd Packets \
Flow Duration 1.000000 0.344572
Total Fwd Packets 0.344572 1.000000
Total Backward Packets 0.284508 0.693930
Fwd Packets Length Total 0.266345 0.580110
Bwd Packets Length Total 0.227557 0.536628
Fwd Packet Length Max 0.291309 0.535619
Fwd Packet Length Mean 0.223883 0.443021
Fwd Packet Length Std 0.258275 0.460252
Bwd Packet Length Max 0.193896 0.559189
Bwd Packet Length Mean 0.069250 0.389019
Bwd Packet Length Std 0.208548 0.526333
Flow Bytes/s -0.168463 -0.056336
Flow Packets/s -0.201210 -0.142290
Flow IAT Mean 0.527404 0.155158
Flow IAT Std 0.565126 0.229385
Flow IAT Max 0.530732 0.198171
Flow IAT Min -0.087898 -0.165506
Fwd IAT Total 0.780641 0.396678
Fwd IAT Mean 0.518293 0.244505
Fwd IAT Std 0.262062 0.526483
Fwd IAT Max 0.359788 0.225066
Fwd IAT Min 0.042262 0.121484
Bwd IAT Total 0.124080 0.324204
Bwd IAT Mean 0.020241 0.192164
Bwd IAT Std 0.206601 0.448569
Bwd IAT Max 0.209195 0.410429
Bwd IAT Min 0.110107 0.098966
Fwd Header Length 0.330307 0.806061
Bwd Header Length 0.294374 0.711120
Fwd Packets/s -0.201649 -0.141910
Bwd Packets/s -0.215691 -0.126428
Packet Length Max 0.244895 0.591023
Packet Length Mean 0.160033 0.499634
Packet Length Std 0.211278 0.500338
Packet Length Variance 0.183152 0.456418
Avg Packet Size 0.131517 0.444920
Avg Fwd Segment Size 0.223883 0.443021
Avg Bwd Segment Size 0.069250 0.389019
Subflow Fwd Packets 0.344572 1.000000
Subflow Fwd Bytes 0.266345 0.580110
Subflow Bwd Packets 0.284508 0.693930
Subflow Bwd Bytes 0.227557 0.536628
Init Fwd Win Bytes -0.136505 0.028631
Init Bwd Win Bytes 0.220855 0.541543
Fwd Act Data Packets 0.245885 0.755914
Fwd Seg Size Min 0.172877 0.151001
attack_id 0.026159 -0.039465
Total Backward Packets Fwd Packets Length Total \
Flow Duration 0.284508 0.266345
Total Fwd Packets 0.693930 0.580110
Total Backward Packets 1.000000 0.457674
Fwd Packets Length Total 0.457674 1.000000
Bwd Packets Length Total 0.667482 0.470164
Fwd Packet Length Max 0.431439 0.796363
Fwd Packet Length Mean 0.454429 0.673515
Fwd Packet Length Std 0.418142 0.656346
Bwd Packet Length Max 0.626985 0.547430
Bwd Packet Length Mean 0.530979 0.397051
Bwd Packet Length Std 0.589303 0.512406
Flow Bytes/s 0.053830 -0.001175
Flow Packets/s -0.046278 -0.068197
Flow IAT Mean 0.098510 0.200397
Flow IAT Std 0.192020 0.312305
Flow IAT Max 0.129965 0.232288
Flow IAT Min -0.113239 -0.146314
Fwd IAT Total 0.272520 0.350136
Fwd IAT Mean 0.145574 0.293066
Fwd IAT Std 0.513079 0.413410
Fwd IAT Max 0.106457 0.292654
Fwd IAT Min 0.181549 0.101196
Bwd IAT Total 0.476214 0.259034
Bwd IAT Mean 0.163481 0.183088
Bwd IAT Std 0.512708 0.339885
Bwd IAT Max 0.521269 0.281791
Bwd IAT Min 0.075412 0.159474
Fwd Header Length 0.615261 0.570863
Bwd Header Length 0.846017 0.489739
Fwd Packets/s -0.068510 -0.072082
Bwd Packets/s -0.013203 -0.022332
Packet Length Max 0.596776 0.680678
Packet Length Mean 0.570523 0.577650
Packet Length Std 0.552656 0.554814
Packet Length Variance 0.457982 0.514811
Avg Packet Size 0.537342 0.535458
Avg Fwd Segment Size 0.454429 0.673515
Avg Bwd Segment Size 0.530979 0.397051
Subflow Fwd Packets 0.693930 0.580110
Subflow Fwd Bytes 0.457674 1.000000
Subflow Bwd Packets 1.000000 0.457674
Subflow Bwd Bytes 0.667482 0.470164
Init Fwd Win Bytes -0.018963 0.148935
Init Bwd Win Bytes 0.501559 0.477821
Fwd Act Data Packets 0.602988 0.559723
Fwd Seg Size Min 0.124126 0.200562
attack_id -0.064490 -0.085335
Bwd Packets Length Total Fwd Packet Length Max \
Flow Duration 0.227557 0.291309
Total Fwd Packets 0.536628 0.535619
Total Backward Packets 0.667482 0.431439
Fwd Packets Length Total 0.470164 0.796363
Bwd Packets Length Total 1.000000 0.451904
Fwd Packet Length Max 0.451904 1.000000
Fwd Packet Length Mean 0.433777 0.876929
Fwd Packet Length Std 0.401362 0.910410
Bwd Packet Length Max 0.642817 0.459193
Bwd Packet Length Mean 0.556433 0.387529
Bwd Packet Length Std 0.560662 0.475939
Flow Bytes/s -0.043166 -0.002276
Flow Packets/s -0.054143 -0.017606
Flow IAT Mean 0.038200 0.206706
Flow IAT Std 0.142081 0.403239
Flow IAT Max 0.117542 0.256882
Flow IAT Min -0.075916 -0.159033
Fwd IAT Total 0.261268 0.383010
Fwd IAT Mean 0.119108 0.364603
Fwd IAT Std 0.606764 0.324852
Fwd IAT Max 0.145319 0.323373
Fwd IAT Min 0.152866 0.067656
Bwd IAT Total 0.539414 0.198299
Bwd IAT Mean 0.119134 0.113974
Bwd IAT Std 0.601234 0.289248
Bwd IAT Max 0.637725 0.215992
Bwd IAT Min 0.043402 0.317312
Fwd Header Length 0.488606 0.580049
Bwd Header Length 0.559730 0.468355
Fwd Packets/s -0.064800 -0.024095
Bwd Packets/s -0.046593 -0.034213
Packet Length Max 0.601459 0.693306
Packet Length Mean 0.530830 0.624118
Packet Length Std 0.485171 0.605821
Packet Length Variance 0.521250 0.610889
Avg Packet Size 0.484636 0.592635
Avg Fwd Segment Size 0.433777 0.876929
Avg Bwd Segment Size 0.556433 0.387529
Subflow Fwd Packets 0.536628 0.535619
Subflow Fwd Bytes 0.470164 0.796363
Subflow Bwd Packets 0.667482 0.431439
Subflow Bwd Bytes 1.000000 0.451904
Init Fwd Win Bytes 0.058459 0.250035
Init Bwd Win Bytes 0.626598 0.463152
Fwd Act Data Packets 0.495346 0.397308
Fwd Seg Size Min 0.053893 0.267014
attack_id -0.035668 -0.114476
Fwd Packet Length Mean Fwd Packet Length Std \
Flow Duration 0.223883 0.258275
Total Fwd Packets 0.443021 0.460252
Total Backward Packets 0.454429 0.418142
Fwd Packets Length Total 0.673515 0.656346
Bwd Packets Length Total 0.433777 0.401362
Fwd Packet Length Max 0.876929 0.910410
Fwd Packet Length Mean 1.000000 0.906266
Fwd Packet Length Std 0.906266 1.000000
Bwd Packet Length Max 0.389350 0.396018
Bwd Packet Length Mean 0.430169 0.376194
Bwd Packet Length Std 0.406844 0.451969
Flow Bytes/s 0.088291 0.039993
Flow Packets/s 0.079698 0.047994
Flow IAT Mean 0.132827 0.174096
Flow IAT Std 0.350501 0.379778
Flow IAT Max 0.116476 0.219872
Flow IAT Min -0.096717 -0.162102
Fwd IAT Total 0.321079 0.333408
Fwd IAT Mean 0.314509 0.330904
Fwd IAT Std 0.285779 0.229975
Fwd IAT Max 0.178340 0.270654
Fwd IAT Min 0.065737 0.093723
Bwd IAT Total 0.203509 0.160607
Bwd IAT Mean 0.018704 0.077178
Bwd IAT Std 0.242220 0.221190
Bwd IAT Max 0.166386 0.154433
Bwd IAT Min 0.315777 0.378219
Fwd Header Length 0.467705 0.530178
Bwd Header Length 0.462312 0.456745
Fwd Packets/s 0.065484 0.035362
Bwd Packets/s 0.053065 -0.014289
Packet Length Max 0.611326 0.634520
Packet Length Mean 0.666252 0.595388
Packet Length Std 0.567894 0.610741
Packet Length Variance 0.576243 0.627071
Avg Packet Size 0.656824 0.563593
Avg Fwd Segment Size 1.000000 0.906266
Avg Bwd Segment Size 0.430169 0.376194
Subflow Fwd Packets 0.443021 0.460252
Subflow Fwd Bytes 0.673515 0.656346
Subflow Bwd Packets 0.454429 0.418142
Subflow Bwd Bytes 0.433777 0.401362
Init Fwd Win Bytes 0.261404 0.313570
Init Bwd Win Bytes 0.415983 0.390927
Fwd Act Data Packets 0.323995 0.282448
Fwd Seg Size Min 0.226251 0.302648
attack_id -0.196100 -0.141036
Bwd Packet Length Max Bwd Packet Length Mean ... \
Flow Duration 0.193896 0.069250 ...
Total Fwd Packets 0.559189 0.389019 ...
Total Backward Packets 0.626985 0.530979 ...
Fwd Packets Length Total 0.547430 0.397051 ...
Bwd Packets Length Total 0.642817 0.556433 ...
Fwd Packet Length Max 0.459193 0.387529 ...
Fwd Packet Length Mean 0.389350 0.430169 ...
Fwd Packet Length Std 0.396018 0.376194 ...
Bwd Packet Length Max 1.000000 0.745937 ...
Bwd Packet Length Mean 0.745937 1.000000 ...
Bwd Packet Length Std 0.905455 0.713763 ...
Flow Bytes/s 0.027773 0.059257 ...
Flow Packets/s -0.067909 0.017235 ...
Flow IAT Mean 0.122742 0.037822 ...
Flow IAT Std 0.130569 0.079935 ...
Flow IAT Max 0.291129 0.132577 ...
Flow IAT Min -0.128903 -0.045374 ...
Fwd IAT Total 0.192159 0.070620 ...
Fwd IAT Mean 0.110592 0.029596 ...
Fwd IAT Std 0.400185 0.257838 ...
Fwd IAT Max 0.303202 0.145865 ...
Fwd IAT Min 0.239714 0.143203 ...
Bwd IAT Total 0.388287 0.314509 ...
Bwd IAT Mean 0.308447 0.216200 ...
Bwd IAT Std 0.415092 0.319511 ...
Bwd IAT Max 0.441702 0.364072 ...
Bwd IAT Min 0.021272 0.069716 ...
Fwd Header Length 0.566809 0.398443 ...
Bwd Header Length 0.638425 0.530512 ...
Fwd Packets/s -0.079164 0.011930 ...
Bwd Packets/s -0.015688 0.105039 ...
Packet Length Max 0.916820 0.702215 ...
Packet Length Mean 0.736857 0.808163 ...
Packet Length Std 0.850145 0.682757 ...
Packet Length Variance 0.678489 0.724085 ...
Avg Packet Size 0.695708 0.801836 ...
Avg Fwd Segment Size 0.389350 0.430169 ...
Avg Bwd Segment Size 0.745937 1.000000 ...
Subflow Fwd Packets 0.559189 0.389019 ...
Subflow Fwd Bytes 0.547430 0.397051 ...
Subflow Bwd Packets 0.626985 0.530979 ...
Subflow Bwd Bytes 0.642817 0.556433 ...
Init Fwd Win Bytes 0.082989 0.125144 ...
Init Bwd Win Bytes 0.383273 0.277818 ...
Fwd Act Data Packets 0.585226 0.415084 ...
Fwd Seg Size Min 0.082755 0.017201 ...
attack_id -0.064293 -0.085765 ...
Avg Bwd Segment Size Subflow Fwd Packets \
Flow Duration 0.069250 0.344572
Total Fwd Packets 0.389019 1.000000
Total Backward Packets 0.530979 0.693930
Fwd Packets Length Total 0.397051 0.580110
Bwd Packets Length Total 0.556433 0.536628
Fwd Packet Length Max 0.387529 0.535619
Fwd Packet Length Mean 0.430169 0.443021
Fwd Packet Length Std 0.376194 0.460252
Bwd Packet Length Max 0.745937 0.559189
Bwd Packet Length Mean 1.000000 0.389019
Bwd Packet Length Std 0.713763 0.526333
Flow Bytes/s 0.059257 -0.056336
Flow Packets/s 0.017235 -0.142290
Flow IAT Mean 0.037822 0.155158
Flow IAT Std 0.079935 0.229385
Flow IAT Max 0.132577 0.198171
Flow IAT Min -0.045374 -0.165506
Fwd IAT Total 0.070620 0.396678
Fwd IAT Mean 0.029596 0.244505
Fwd IAT Std 0.257838 0.526483
Fwd IAT Max 0.145865 0.225066
Fwd IAT Min 0.143203 0.121484
Bwd IAT Total 0.314509 0.324204
Bwd IAT Mean 0.216200 0.192164
Bwd IAT Std 0.319511 0.448569
Bwd IAT Max 0.364072 0.410429
Bwd IAT Min 0.069716 0.098966
Fwd Header Length 0.398443 0.806061
Bwd Header Length 0.530512 0.711120
Fwd Packets/s 0.011930 -0.141910
Bwd Packets/s 0.105039 -0.126428
Packet Length Max 0.702215 0.591023
Packet Length Mean 0.808163 0.499634
Packet Length Std 0.682757 0.500338
Packet Length Variance 0.724085 0.456418
Avg Packet Size 0.801836 0.444920
Avg Fwd Segment Size 0.430169 0.443021
Avg Bwd Segment Size 1.000000 0.389019
Subflow Fwd Packets 0.389019 1.000000
Subflow Fwd Bytes 0.397051 0.580110
Subflow Bwd Packets 0.530979 0.693930
Subflow Bwd Bytes 0.556433 0.536628
Init Fwd Win Bytes 0.125144 0.028631
Init Bwd Win Bytes 0.277818 0.541543
Fwd Act Data Packets 0.415084 0.755914
Fwd Seg Size Min 0.017201 0.151001
attack_id -0.085765 -0.039465
Subflow Fwd Bytes Subflow Bwd Packets \
Flow Duration 0.266345 0.284508
Total Fwd Packets 0.580110 0.693930
Total Backward Packets 0.457674 1.000000
Fwd Packets Length Total 1.000000 0.457674
Bwd Packets Length Total 0.470164 0.667482
Fwd Packet Length Max 0.796363 0.431439
Fwd Packet Length Mean 0.673515 0.454429
Fwd Packet Length Std 0.656346 0.418142
Bwd Packet Length Max 0.547430 0.626985
Bwd Packet Length Mean 0.397051 0.530979
Bwd Packet Length Std 0.512406 0.589303
Flow Bytes/s -0.001175 0.053830
Flow Packets/s -0.068197 -0.046278
Flow IAT Mean 0.200397 0.098510
Flow IAT Std 0.312305 0.192020
Flow IAT Max 0.232288 0.129965
Flow IAT Min -0.146314 -0.113239
Fwd IAT Total 0.350136 0.272520
Fwd IAT Mean 0.293066 0.145574
Fwd IAT Std 0.413410 0.513079
Fwd IAT Max 0.292654 0.106457
Fwd IAT Min 0.101196 0.181549
Bwd IAT Total 0.259034 0.476214
Bwd IAT Mean 0.183088 0.163481
Bwd IAT Std 0.339885 0.512708
Bwd IAT Max 0.281791 0.521269
Bwd IAT Min 0.159474 0.075412
Fwd Header Length 0.570863 0.615261
Bwd Header Length 0.489739 0.846017
Fwd Packets/s -0.072082 -0.068510
Bwd Packets/s -0.022332 -0.013203
Packet Length Max 0.680678 0.596776
Packet Length Mean 0.577650 0.570523
Packet Length Std 0.554814 0.552656
Packet Length Variance 0.514811 0.457982
Avg Packet Size 0.535458 0.537342
Avg Fwd Segment Size 0.673515 0.454429
Avg Bwd Segment Size 0.397051 0.530979
Subflow Fwd Packets 0.580110 0.693930
Subflow Fwd Bytes 1.000000 0.457674
Subflow Bwd Packets 0.457674 1.000000
Subflow Bwd Bytes 0.470164 0.667482
Init Fwd Win Bytes 0.148935 -0.018963
Init Bwd Win Bytes 0.477821 0.501559
Fwd Act Data Packets 0.559723 0.602988
Fwd Seg Size Min 0.200562 0.124126
attack_id -0.085335 -0.064490
Subflow Bwd Bytes Init Fwd Win Bytes \
Flow Duration 0.227557 -0.136505
Total Fwd Packets 0.536628 0.028631
Total Backward Packets 0.667482 -0.018963
Fwd Packets Length Total 0.470164 0.148935
Bwd Packets Length Total 1.000000 0.058459
Fwd Packet Length Max 0.451904 0.250035
Fwd Packet Length Mean 0.433777 0.261404
Fwd Packet Length Std 0.401362 0.313570
Bwd Packet Length Max 0.642817 0.082989
Bwd Packet Length Mean 0.556433 0.125144
Bwd Packet Length Std 0.560662 0.169575
Flow Bytes/s -0.043166 0.062303
Flow Packets/s -0.054143 0.192189
Flow IAT Mean 0.038200 -0.113184
Flow IAT Std 0.142081 -0.153843
Flow IAT Max 0.117542 -0.107938
Flow IAT Min -0.075916 -0.073047
Fwd IAT Total 0.261268 -0.131350
Fwd IAT Mean 0.119108 -0.124701
Fwd IAT Std 0.606764 -0.037134
Fwd IAT Max 0.145319 -0.092624
Fwd IAT Min 0.152866 0.077115
Bwd IAT Total 0.539414 -0.045929
Bwd IAT Mean 0.119134 -0.069465
Bwd IAT Std 0.601234 -0.068592
Bwd IAT Max 0.637725 -0.070402
Bwd IAT Min 0.043402 0.246999
Fwd Header Length 0.488606 0.013206
Bwd Header Length 0.559730 -0.039221
Fwd Packets/s -0.064800 0.188806
Bwd Packets/s -0.046593 0.130409
Packet Length Max 0.601459 0.171932
Packet Length Mean 0.530830 0.183537
Packet Length Std 0.485171 0.201096
Packet Length Variance 0.521250 0.236211
Avg Packet Size 0.484636 0.178445
Avg Fwd Segment Size 0.433777 0.261404
Avg Bwd Segment Size 0.556433 0.125144
Subflow Fwd Packets 0.536628 0.028631
Subflow Fwd Bytes 0.470164 0.148935
Subflow Bwd Packets 0.667482 -0.018963
Subflow Bwd Bytes 1.000000 0.058459
Init Fwd Win Bytes 0.058459 1.000000
Init Bwd Win Bytes 0.626598 0.152317
Fwd Act Data Packets 0.495346 -0.036225
Fwd Seg Size Min 0.053893 0.032016
attack_id -0.035668 -0.024570
Init Bwd Win Bytes Fwd Act Data Packets \
Flow Duration 0.220855 0.245885
Total Fwd Packets 0.541543 0.755914
Total Backward Packets 0.501559 0.602988
Fwd Packets Length Total 0.477821 0.559723
Bwd Packets Length Total 0.626598 0.495346
Fwd Packet Length Max 0.463152 0.397308
Fwd Packet Length Mean 0.415983 0.323995
Fwd Packet Length Std 0.390927 0.282448
Bwd Packet Length Max 0.383273 0.585226
Bwd Packet Length Mean 0.277818 0.415084
Bwd Packet Length Std 0.315921 0.529967
Flow Bytes/s -0.084479 -0.040516
Flow Packets/s -0.079840 -0.139593
Flow IAT Mean 0.024429 0.145694
Flow IAT Std 0.022531 0.130074
Flow IAT Max -0.014380 0.185398
Flow IAT Min -0.040549 -0.130984
Fwd IAT Total 0.291423 0.296800
Fwd IAT Mean 0.104470 0.180328
Fwd IAT Std 0.699467 0.485163
Fwd IAT Max 0.029922 0.221819
Fwd IAT Min -0.004198 0.112142
Bwd IAT Total 0.470894 0.332644
Bwd IAT Mean -0.046640 0.291875
Bwd IAT Std 0.558057 0.408957
Bwd IAT Max 0.513751 0.377197
Bwd IAT Min 0.064453 -0.009818
Fwd Header Length 0.424312 0.667525
Bwd Header Length 0.380560 0.616584
Fwd Packets/s -0.073528 -0.143159
Bwd Packets/s -0.069479 -0.095185
Packet Length Max 0.393128 0.558498
Packet Length Mean 0.320431 0.481511
Packet Length Std 0.287666 0.470960
Packet Length Variance 0.344496 0.378085
Avg Packet Size 0.274867 0.434184
Avg Fwd Segment Size 0.415983 0.323995
Avg Bwd Segment Size 0.277818 0.415084
Subflow Fwd Packets 0.541543 0.755914
Subflow Fwd Bytes 0.477821 0.559723
Subflow Bwd Packets 0.501559 0.602988
Subflow Bwd Bytes 0.626598 0.495346
Init Fwd Win Bytes 0.152317 -0.036225
Init Bwd Win Bytes 1.000000 0.467861
Fwd Act Data Packets 0.467861 1.000000
Fwd Seg Size Min -0.034003 0.049872
attack_id -0.188538 -0.086071
Fwd Seg Size Min attack_id
Flow Duration 0.172877 0.026159
Total Fwd Packets 0.151001 -0.039465
Total Backward Packets 0.124126 -0.064490
Fwd Packets Length Total 0.200562 -0.085335
Bwd Packets Length Total 0.053893 -0.035668
Fwd Packet Length Max 0.267014 -0.114476
Fwd Packet Length Mean 0.226251 -0.196100
Fwd Packet Length Std 0.302648 -0.141036
Bwd Packet Length Max 0.082755 -0.064293
Bwd Packet Length Mean 0.017201 -0.085765
Bwd Packet Length Std 0.127629 -0.039774
Flow Bytes/s -0.078757 -0.014417
Flow Packets/s -0.120632 0.040890
Flow IAT Mean 0.148807 0.016617
Flow IAT Std 0.246742 0.012121
Flow IAT Max 0.162000 -0.049590
Flow IAT Min -0.309198 -0.098165
Fwd IAT Total 0.204756 -0.017367
Fwd IAT Mean 0.221853 -0.019111
Fwd IAT Std -0.002423 -0.121112
Fwd IAT Max 0.178686 -0.085004
Fwd IAT Min 0.057986 0.108564
Bwd IAT Total 0.017300 0.046394
Bwd IAT Mean 0.057588 -0.032041
Bwd IAT Std 0.029104 0.013587
Bwd IAT Max 0.011774 0.098360
Bwd IAT Min 0.107613 -0.056871
Fwd Header Length 0.327892 -0.021366
Bwd Header Length 0.264664 -0.029029
Fwd Packets/s -0.127855 0.037160
Bwd Packets/s -0.072828 -0.080766
Packet Length Max 0.154635 -0.061398
Packet Length Mean 0.099643 -0.098992
Packet Length Std 0.152382 -0.097046
Packet Length Variance 0.172997 -0.023701
Avg Packet Size 0.058608 -0.103647
Avg Fwd Segment Size 0.226251 -0.196100
Avg Bwd Segment Size 0.017201 -0.085765
Subflow Fwd Packets 0.151001 -0.039465
Subflow Fwd Bytes 0.200562 -0.085335
Subflow Bwd Packets 0.124126 -0.064490
Subflow Bwd Bytes 0.053893 -0.035668
Init Fwd Win Bytes 0.032016 -0.024570
Init Bwd Win Bytes -0.034003 -0.188538
Fwd Act Data Packets 0.049872 -0.086071
Fwd Seg Size Min 1.000000 0.109920
attack_id 0.109920 1.000000
[47 rows x 47 columns]
# Print every row of the correlation matrix in full, one Series per row, so
# that no column is elided the way it is when the whole frame is printed.
# iterrows() yields (label, Series) pairs; enumerate supplies the positional
# row number used in the "Row N" prefix.
for i, (row_label, row) in enumerate(corr_matrix.iterrows()):
    print(f"Row {i}: {row}")
Row 0: Flow Duration 1.000000 Total Fwd Packets 0.344572 Total Backward Packets 0.284508 Fwd Packets Length Total 0.266345 Bwd Packets Length Total 0.227557 Fwd Packet Length Max 0.291309 Fwd Packet Length Mean 0.223883 Fwd Packet Length Std 0.258275 Bwd Packet Length Max 0.193896 Bwd Packet Length Mean 0.069250 Bwd Packet Length Std 0.208548 Flow Bytes/s -0.168463 Flow Packets/s -0.201210 Flow IAT Mean 0.527404 Flow IAT Std 0.565126 Flow IAT Max 0.530732 Flow IAT Min -0.087898 Fwd IAT Total 0.780641 Fwd IAT Mean 0.518293 Fwd IAT Std 0.262062 Fwd IAT Max 0.359788 Fwd IAT Min 0.042262 Bwd IAT Total 0.124080 Bwd IAT Mean 0.020241 Bwd IAT Std 0.206601 Bwd IAT Max 0.209195 Bwd IAT Min 0.110107 Fwd Header Length 0.330307 Bwd Header Length 0.294374 Fwd Packets/s -0.201649 Bwd Packets/s -0.215691 Packet Length Max 0.244895 Packet Length Mean 0.160033 Packet Length Std 0.211278 Packet Length Variance 0.183152 Avg Packet Size 0.131517 Avg Fwd Segment Size 0.223883 Avg Bwd Segment Size 0.069250 Subflow Fwd Packets 0.344572 Subflow Fwd Bytes 0.266345 Subflow Bwd Packets 0.284508 Subflow Bwd Bytes 0.227557 Init Fwd Win Bytes -0.136505 Init Bwd Win Bytes 0.220855 Fwd Act Data Packets 0.245885 Fwd Seg Size Min 0.172877 attack_id 0.026159 Name: Flow Duration, dtype: float64 Row 1: Flow Duration 0.344572 Total Fwd Packets 1.000000 Total Backward Packets 0.693930 Fwd Packets Length Total 0.580110 Bwd Packets Length Total 0.536628 Fwd Packet Length Max 0.535619 Fwd Packet Length Mean 0.443021 Fwd Packet Length Std 0.460252 Bwd Packet Length Max 0.559189 Bwd Packet Length Mean 0.389019 Bwd Packet Length Std 0.526333 Flow Bytes/s -0.056336 Flow Packets/s -0.142290 Flow IAT Mean 0.155158 Flow IAT Std 0.229385 Flow IAT Max 0.198171 Flow IAT Min -0.165506 Fwd IAT Total 0.396678 Fwd IAT Mean 0.244505 Fwd IAT Std 0.526483 Fwd IAT Max 0.225066 Fwd IAT Min 0.121484 Bwd IAT Total 0.324204 Bwd IAT Mean 0.192164 Bwd IAT Std 0.448569 Bwd IAT Max 0.410429 Bwd IAT Min 0.098966 Fwd Header Length 
0.806061 Bwd Header Length 0.711120 Fwd Packets/s -0.141910 Bwd Packets/s -0.126428 Packet Length Max 0.591023 Packet Length Mean 0.499634 Packet Length Std 0.500338 Packet Length Variance 0.456418 Avg Packet Size 0.444920 Avg Fwd Segment Size 0.443021 Avg Bwd Segment Size 0.389019 Subflow Fwd Packets 1.000000 Subflow Fwd Bytes 0.580110 Subflow Bwd Packets 0.693930 Subflow Bwd Bytes 0.536628 Init Fwd Win Bytes 0.028631 Init Bwd Win Bytes 0.541543 Fwd Act Data Packets 0.755914 Fwd Seg Size Min 0.151001 attack_id -0.039465 Name: Total Fwd Packets, dtype: float64 Row 2: Flow Duration 0.284508 Total Fwd Packets 0.693930 Total Backward Packets 1.000000 Fwd Packets Length Total 0.457674 Bwd Packets Length Total 0.667482 Fwd Packet Length Max 0.431439 Fwd Packet Length Mean 0.454429 Fwd Packet Length Std 0.418142 Bwd Packet Length Max 0.626985 Bwd Packet Length Mean 0.530979 Bwd Packet Length Std 0.589303 Flow Bytes/s 0.053830 Flow Packets/s -0.046278 Flow IAT Mean 0.098510 Flow IAT Std 0.192020 Flow IAT Max 0.129965 Flow IAT Min -0.113239 Fwd IAT Total 0.272520 Fwd IAT Mean 0.145574 Fwd IAT Std 0.513079 Fwd IAT Max 0.106457 Fwd IAT Min 0.181549 Bwd IAT Total 0.476214 Bwd IAT Mean 0.163481 Bwd IAT Std 0.512708 Bwd IAT Max 0.521269 Bwd IAT Min 0.075412 Fwd Header Length 0.615261 Bwd Header Length 0.846017 Fwd Packets/s -0.068510 Bwd Packets/s -0.013203 Packet Length Max 0.596776 Packet Length Mean 0.570523 Packet Length Std 0.552656 Packet Length Variance 0.457982 Avg Packet Size 0.537342 Avg Fwd Segment Size 0.454429 Avg Bwd Segment Size 0.530979 Subflow Fwd Packets 0.693930 Subflow Fwd Bytes 0.457674 Subflow Bwd Packets 1.000000 Subflow Bwd Bytes 0.667482 Init Fwd Win Bytes -0.018963 Init Bwd Win Bytes 0.501559 Fwd Act Data Packets 0.602988 Fwd Seg Size Min 0.124126 attack_id -0.064490 Name: Total Backward Packets, dtype: float64 Row 3: Flow Duration 0.266345 Total Fwd Packets 0.580110 Total Backward Packets 0.457674 Fwd Packets Length Total 1.000000 Bwd Packets Length 
Total 0.470164 Fwd Packet Length Max 0.796363 Fwd Packet Length Mean 0.673515 Fwd Packet Length Std 0.656346 Bwd Packet Length Max 0.547430 Bwd Packet Length Mean 0.397051 Bwd Packet Length Std 0.512406 Flow Bytes/s -0.001175 Flow Packets/s -0.068197 Flow IAT Mean 0.200397 Flow IAT Std 0.312305 Flow IAT Max 0.232288 Flow IAT Min -0.146314 Fwd IAT Total 0.350136 Fwd IAT Mean 0.293066 Fwd IAT Std 0.413410 Fwd IAT Max 0.292654 Fwd IAT Min 0.101196 Bwd IAT Total 0.259034 Bwd IAT Mean 0.183088 Bwd IAT Std 0.339885 Bwd IAT Max 0.281791 Bwd IAT Min 0.159474 Fwd Header Length 0.570863 Bwd Header Length 0.489739 Fwd Packets/s -0.072082 Bwd Packets/s -0.022332 Packet Length Max 0.680678 Packet Length Mean 0.577650 Packet Length Std 0.554814 Packet Length Variance 0.514811 Avg Packet Size 0.535458 Avg Fwd Segment Size 0.673515 Avg Bwd Segment Size 0.397051 Subflow Fwd Packets 0.580110 Subflow Fwd Bytes 1.000000 Subflow Bwd Packets 0.457674 Subflow Bwd Bytes 0.470164 Init Fwd Win Bytes 0.148935 Init Bwd Win Bytes 0.477821 Fwd Act Data Packets 0.559723 Fwd Seg Size Min 0.200562 attack_id -0.085335 Name: Fwd Packets Length Total, dtype: float64 Row 4: Flow Duration 0.227557 Total Fwd Packets 0.536628 Total Backward Packets 0.667482 Fwd Packets Length Total 0.470164 Bwd Packets Length Total 1.000000 Fwd Packet Length Max 0.451904 Fwd Packet Length Mean 0.433777 Fwd Packet Length Std 0.401362 Bwd Packet Length Max 0.642817 Bwd Packet Length Mean 0.556433 Bwd Packet Length Std 0.560662 Flow Bytes/s -0.043166 Flow Packets/s -0.054143 Flow IAT Mean 0.038200 Flow IAT Std 0.142081 Flow IAT Max 0.117542 Flow IAT Min -0.075916 Fwd IAT Total 0.261268 Fwd IAT Mean 0.119108 Fwd IAT Std 0.606764 Fwd IAT Max 0.145319 Fwd IAT Min 0.152866 Bwd IAT Total 0.539414 Bwd IAT Mean 0.119134 Bwd IAT Std 0.601234 Bwd IAT Max 0.637725 Bwd IAT Min 0.043402 Fwd Header Length 0.488606 Bwd Header Length 0.559730 Fwd Packets/s -0.064800 Bwd Packets/s -0.046593 Packet Length Max 0.601459 Packet Length Mean 
0.530830 Packet Length Std 0.485171 Packet Length Variance 0.521250 Avg Packet Size 0.484636 Avg Fwd Segment Size 0.433777 Avg Bwd Segment Size 0.556433 Subflow Fwd Packets 0.536628 Subflow Fwd Bytes 0.470164 Subflow Bwd Packets 0.667482 Subflow Bwd Bytes 1.000000 Init Fwd Win Bytes 0.058459 Init Bwd Win Bytes 0.626598 Fwd Act Data Packets 0.495346 Fwd Seg Size Min 0.053893 attack_id -0.035668 Name: Bwd Packets Length Total, dtype: float64 Row 5: Flow Duration 0.291309 Total Fwd Packets 0.535619 Total Backward Packets 0.431439 Fwd Packets Length Total 0.796363 Bwd Packets Length Total 0.451904 Fwd Packet Length Max 1.000000 Fwd Packet Length Mean 0.876929 Fwd Packet Length Std 0.910410 Bwd Packet Length Max 0.459193 Bwd Packet Length Mean 0.387529 Bwd Packet Length Std 0.475939 Flow Bytes/s -0.002276 Flow Packets/s -0.017606 Flow IAT Mean 0.206706 Flow IAT Std 0.403239 Flow IAT Max 0.256882 Flow IAT Min -0.159033 Fwd IAT Total 0.383010 Fwd IAT Mean 0.364603 Fwd IAT Std 0.324852 Fwd IAT Max 0.323373 Fwd IAT Min 0.067656 Bwd IAT Total 0.198299 Bwd IAT Mean 0.113974 Bwd IAT Std 0.289248 Bwd IAT Max 0.215992 Bwd IAT Min 0.317312 Fwd Header Length 0.580049 Bwd Header Length 0.468355 Fwd Packets/s -0.024095 Bwd Packets/s -0.034213 Packet Length Max 0.693306 Packet Length Mean 0.624118 Packet Length Std 0.605821 Packet Length Variance 0.610889 Avg Packet Size 0.592635 Avg Fwd Segment Size 0.876929 Avg Bwd Segment Size 0.387529 Subflow Fwd Packets 0.535619 Subflow Fwd Bytes 0.796363 Subflow Bwd Packets 0.431439 Subflow Bwd Bytes 0.451904 Init Fwd Win Bytes 0.250035 Init Bwd Win Bytes 0.463152 Fwd Act Data Packets 0.397308 Fwd Seg Size Min 0.267014 attack_id -0.114476 Name: Fwd Packet Length Max, dtype: float64 Row 6: Flow Duration 0.223883 Total Fwd Packets 0.443021 Total Backward Packets 0.454429 Fwd Packets Length Total 0.673515 Bwd Packets Length Total 0.433777 Fwd Packet Length Max 0.876929 Fwd Packet Length Mean 1.000000 Fwd Packet Length Std 0.906266 Bwd Packet 
Length Max 0.389350 Bwd Packet Length Mean 0.430169 Bwd Packet Length Std 0.406844 Flow Bytes/s 0.088291 Flow Packets/s 0.079698 Flow IAT Mean 0.132827 Flow IAT Std 0.350501 Flow IAT Max 0.116476 Flow IAT Min -0.096717 Fwd IAT Total 0.321079 Fwd IAT Mean 0.314509 Fwd IAT Std 0.285779 Fwd IAT Max 0.178340 Fwd IAT Min 0.065737 Bwd IAT Total 0.203509 Bwd IAT Mean 0.018704 Bwd IAT Std 0.242220 Bwd IAT Max 0.166386 Bwd IAT Min 0.315777 Fwd Header Length 0.467705 Bwd Header Length 0.462312 Fwd Packets/s 0.065484 Bwd Packets/s 0.053065 Packet Length Max 0.611326 Packet Length Mean 0.666252 Packet Length Std 0.567894 Packet Length Variance 0.576243 Avg Packet Size 0.656824 Avg Fwd Segment Size 1.000000 Avg Bwd Segment Size 0.430169 Subflow Fwd Packets 0.443021 Subflow Fwd Bytes 0.673515 Subflow Bwd Packets 0.454429 Subflow Bwd Bytes 0.433777 Init Fwd Win Bytes 0.261404 Init Bwd Win Bytes 0.415983 Fwd Act Data Packets 0.323995 Fwd Seg Size Min 0.226251 attack_id -0.196100 Name: Fwd Packet Length Mean, dtype: float64 Row 7: Flow Duration 0.258275 Total Fwd Packets 0.460252 Total Backward Packets 0.418142 Fwd Packets Length Total 0.656346 Bwd Packets Length Total 0.401362 Fwd Packet Length Max 0.910410 Fwd Packet Length Mean 0.906266 Fwd Packet Length Std 1.000000 Bwd Packet Length Max 0.396018 Bwd Packet Length Mean 0.376194 Bwd Packet Length Std 0.451969 Flow Bytes/s 0.039993 Flow Packets/s 0.047994 Flow IAT Mean 0.174096 Flow IAT Std 0.379778 Flow IAT Max 0.219872 Flow IAT Min -0.162102 Fwd IAT Total 0.333408 Fwd IAT Mean 0.330904 Fwd IAT Std 0.229975 Fwd IAT Max 0.270654 Fwd IAT Min 0.093723 Bwd IAT Total 0.160607 Bwd IAT Mean 0.077178 Bwd IAT Std 0.221190 Bwd IAT Max 0.154433 Bwd IAT Min 0.378219 Fwd Header Length 0.530178 Bwd Header Length 0.456745 Fwd Packets/s 0.035362 Bwd Packets/s -0.014289 Packet Length Max 0.634520 Packet Length Mean 0.595388 Packet Length Std 0.610741 Packet Length Variance 0.627071 Avg Packet Size 0.563593 Avg Fwd Segment Size 0.906266 Avg Bwd 
Segment Size 0.376194 Subflow Fwd Packets 0.460252 Subflow Fwd Bytes 0.656346 Subflow Bwd Packets 0.418142 Subflow Bwd Bytes 0.401362 Init Fwd Win Bytes 0.313570 Init Bwd Win Bytes 0.390927 Fwd Act Data Packets 0.282448 Fwd Seg Size Min 0.302648 attack_id -0.141036 Name: Fwd Packet Length Std, dtype: float64 Row 8: Flow Duration 0.193896 Total Fwd Packets 0.559189 Total Backward Packets 0.626985 Fwd Packets Length Total 0.547430 Bwd Packets Length Total 0.642817 Fwd Packet Length Max 0.459193 Fwd Packet Length Mean 0.389350 Fwd Packet Length Std 0.396018 Bwd Packet Length Max 1.000000 Bwd Packet Length Mean 0.745937 Bwd Packet Length Std 0.905455 Flow Bytes/s 0.027773 Flow Packets/s -0.067909 Flow IAT Mean 0.122742 Flow IAT Std 0.130569 Flow IAT Max 0.291129 Flow IAT Min -0.128903 Fwd IAT Total 0.192159 Fwd IAT Mean 0.110592 Fwd IAT Std 0.400185 Fwd IAT Max 0.303202 Fwd IAT Min 0.239714 Bwd IAT Total 0.388287 Bwd IAT Mean 0.308447 Bwd IAT Std 0.415092 Bwd IAT Max 0.441702 Bwd IAT Min 0.021272 Fwd Header Length 0.566809 Bwd Header Length 0.638425 Fwd Packets/s -0.079164 Bwd Packets/s -0.015688 Packet Length Max 0.916820 Packet Length Mean 0.736857 Packet Length Std 0.850145 Packet Length Variance 0.678489 Avg Packet Size 0.695708 Avg Fwd Segment Size 0.389350 Avg Bwd Segment Size 0.745937 Subflow Fwd Packets 0.559189 Subflow Fwd Bytes 0.547430 Subflow Bwd Packets 0.626985 Subflow Bwd Bytes 0.642817 Init Fwd Win Bytes 0.082989 Init Bwd Win Bytes 0.383273 Fwd Act Data Packets 0.585226 Fwd Seg Size Min 0.082755 attack_id -0.064293 Name: Bwd Packet Length Max, dtype: float64 Row 9: Flow Duration 0.069250 Total Fwd Packets 0.389019 Total Backward Packets 0.530979 Fwd Packets Length Total 0.397051 Bwd Packets Length Total 0.556433 Fwd Packet Length Max 0.387529 Fwd Packet Length Mean 0.430169 Fwd Packet Length Std 0.376194 Bwd Packet Length Max 0.745937 Bwd Packet Length Mean 1.000000 Bwd Packet Length Std 0.713763 Flow Bytes/s 0.059257 Flow Packets/s 0.017235 Flow IAT 
Mean 0.037822 Flow IAT Std 0.079935 Flow IAT Max 0.132577 Flow IAT Min -0.045374 Fwd IAT Total 0.070620 Fwd IAT Mean 0.029596 Fwd IAT Std 0.257838 Fwd IAT Max 0.145865 Fwd IAT Min 0.143203 Bwd IAT Total 0.314509 Bwd IAT Mean 0.216200 Bwd IAT Std 0.319511 Bwd IAT Max 0.364072 Bwd IAT Min 0.069716 Fwd Header Length 0.398443 Bwd Header Length 0.530512 Fwd Packets/s 0.011930 Bwd Packets/s 0.105039 Packet Length Max 0.702215 Packet Length Mean 0.808163 Packet Length Std 0.682757 Packet Length Variance 0.724085 Avg Packet Size 0.801836 Avg Fwd Segment Size 0.430169 Avg Bwd Segment Size 1.000000 Subflow Fwd Packets 0.389019 Subflow Fwd Bytes 0.397051 Subflow Bwd Packets 0.530979 Subflow Bwd Bytes 0.556433 Init Fwd Win Bytes 0.125144 Init Bwd Win Bytes 0.277818 Fwd Act Data Packets 0.415084 Fwd Seg Size Min 0.017201 attack_id -0.085765 Name: Bwd Packet Length Mean, dtype: float64 Row 10: Flow Duration 0.208548 Total Fwd Packets 0.526333 Total Backward Packets 0.589303 Fwd Packets Length Total 0.512406 Bwd Packets Length Total 0.560662 Fwd Packet Length Max 0.475939 Fwd Packet Length Mean 0.406844 Fwd Packet Length Std 0.451969 Bwd Packet Length Max 0.905455 Bwd Packet Length Mean 0.713763 Bwd Packet Length Std 1.000000 Flow Bytes/s 0.031455 Flow Packets/s -0.046486 Flow IAT Mean 0.165044 Flow IAT Std 0.185775 Flow IAT Max 0.307785 Flow IAT Min -0.161815 Fwd IAT Total 0.189628 Fwd IAT Mean 0.144140 Fwd IAT Std 0.315058 Fwd IAT Max 0.304261 Fwd IAT Min 0.230290 Bwd IAT Total 0.356071 Bwd IAT Mean 0.264339 Bwd IAT Std 0.361693 Bwd IAT Max 0.404445 Bwd IAT Min 0.080120 Fwd Header Length 0.534625 Bwd Header Length 0.606914 Fwd Packets/s -0.060242 Bwd Packets/s -0.021202 Packet Length Max 0.866421 Packet Length Mean 0.730641 Packet Length Std 0.886948 Packet Length Variance 0.685478 Avg Packet Size 0.701522 Avg Fwd Segment Size 0.406844 Avg Bwd Segment Size 0.713763 Subflow Fwd Packets 0.526333 Subflow Fwd Bytes 0.512406 Subflow Bwd Packets 0.589303 Subflow Bwd Bytes 0.560662 
Init Fwd Win Bytes 0.169575 Init Bwd Win Bytes 0.315921 Fwd Act Data Packets 0.529967 Fwd Seg Size Min 0.127629 attack_id -0.039774 Name: Bwd Packet Length Std, dtype: float64 Row 11: Flow Duration -0.168463 Total Fwd Packets -0.056336 Total Backward Packets 0.053830 Fwd Packets Length Total -0.001175 Bwd Packets Length Total -0.043166 Fwd Packet Length Max -0.002276 Fwd Packet Length Mean 0.088291 Fwd Packet Length Std 0.039993 Bwd Packet Length Max 0.027773 Bwd Packet Length Mean 0.059257 Bwd Packet Length Std 0.031455 Flow Bytes/s 1.000000 Flow Packets/s 0.434222 Flow IAT Mean -0.178772 Flow IAT Std -0.133443 Flow IAT Max -0.175313 Flow IAT Min -0.041821 Fwd IAT Total -0.142582 Fwd IAT Mean -0.143920 Fwd IAT Std -0.069871 Fwd IAT Max -0.155532 Fwd IAT Min 0.036841 Bwd IAT Total -0.065239 Bwd IAT Mean -0.143259 Bwd IAT Std -0.055435 Bwd IAT Max -0.083577 Bwd IAT Min -0.052498 Fwd Header Length -0.082406 Bwd Header Length 0.010898 Fwd Packets/s 0.382884 Bwd Packets/s 0.383061 Packet Length Max 0.031106 Packet Length Mean 0.093525 Packet Length Std 0.062113 Packet Length Variance -0.019063 Avg Packet Size 0.107747 Avg Fwd Segment Size 0.088291 Avg Bwd Segment Size 0.059257 Subflow Fwd Packets -0.056336 Subflow Fwd Bytes -0.001175 Subflow Bwd Packets 0.053830 Subflow Bwd Bytes -0.043166 Init Fwd Win Bytes 0.062303 Init Bwd Win Bytes -0.084479 Fwd Act Data Packets -0.040516 Fwd Seg Size Min -0.078757 attack_id -0.014417 Name: Flow Bytes/s, dtype: float64 Row 12: Flow Duration -0.201210 Total Fwd Packets -0.142290 Total Backward Packets -0.046278 Fwd Packets Length Total -0.068197 Bwd Packets Length Total -0.054143 Fwd Packet Length Max -0.017606 Fwd Packet Length Mean 0.079698 Fwd Packet Length Std 0.047994 Bwd Packet Length Max -0.067909 Bwd Packet Length Mean 0.017235 Bwd Packet Length Std -0.046486 Flow Bytes/s 0.434222 Flow Packets/s 1.000000 Flow IAT Mean -0.194094 Flow IAT Std -0.159992 Flow IAT Max -0.195863 Flow IAT Min -0.060208 Fwd IAT Total -0.169590 Fwd 
IAT Mean -0.160047 Fwd IAT Std -0.110826 Fwd IAT Max -0.171152 Fwd IAT Min 0.025983 Bwd IAT Total -0.117358 Bwd IAT Mean -0.147376 Bwd IAT Std -0.115618 Bwd IAT Max -0.133601 Bwd IAT Min -0.069277 Fwd Header Length -0.153481 Bwd Header Length -0.089666 Fwd Packets/s 0.835874 Bwd Packets/s 0.258343 Packet Length Max -0.048260 Packet Length Mean 0.015124 Packet Length Std -0.015276 Packet Length Variance -0.014598 Avg Packet Size 0.038576 Avg Fwd Segment Size 0.079698 Avg Bwd Segment Size 0.017235 Subflow Fwd Packets -0.142290 Subflow Fwd Bytes -0.068197 Subflow Bwd Packets -0.046278 Subflow Bwd Bytes -0.054143 Init Fwd Win Bytes 0.192189 Init Bwd Win Bytes -0.079840 Fwd Act Data Packets -0.139593 Fwd Seg Size Min -0.120632 attack_id 0.040890 Name: Flow Packets/s, dtype: float64 Row 13: Flow Duration 0.527404 Total Fwd Packets 0.155158 Total Backward Packets 0.098510 Fwd Packets Length Total 0.200397 Bwd Packets Length Total 0.038200 Fwd Packet Length Max 0.206706 Fwd Packet Length Mean 0.132827 Fwd Packet Length Std 0.174096 Bwd Packet Length Max 0.122742 Bwd Packet Length Mean 0.037822 Bwd Packet Length Std 0.165044 Flow Bytes/s -0.178772 Flow Packets/s -0.194094 Flow IAT Mean 1.000000 Flow IAT Std 0.488745 Flow IAT Max 0.480091 Flow IAT Min -0.092538 Fwd IAT Total 0.448291 Fwd IAT Mean 0.623333 Fwd IAT Std 0.024000 Fwd IAT Max 0.401846 Fwd IAT Min 0.032743 Bwd IAT Total 0.011753 Bwd IAT Mean 0.117046 Bwd IAT Std 0.014825 Bwd IAT Max 0.035911 Bwd IAT Min 0.095276 Fwd Header Length 0.175209 Bwd Header Length 0.155082 Fwd Packets/s -0.193202 Bwd Packets/s -0.213609 Packet Length Max 0.185374 Packet Length Mean 0.113642 Packet Length Std 0.190332 Packet Length Variance 0.115945 Avg Packet Size 0.099798 Avg Fwd Segment Size 0.132827 Avg Bwd Segment Size 0.037822 Subflow Fwd Packets 0.155158 Subflow Fwd Bytes 0.200397 Subflow Bwd Packets 0.098510 Subflow Bwd Bytes 0.038200 Init Fwd Win Bytes -0.113184 Init Bwd Win Bytes 0.024429 Fwd Act Data Packets 0.145694 Fwd Seg 
Size Min 0.148807 attack_id 0.016617 Name: Flow IAT Mean, dtype: float64 Row 14: Flow Duration 0.565126 Total Fwd Packets 0.229385 Total Backward Packets 0.192020 Fwd Packets Length Total 0.312305 Bwd Packets Length Total 0.142081 Fwd Packet Length Max 0.403239 Fwd Packet Length Mean 0.350501 Fwd Packet Length Std 0.379778 Bwd Packet Length Max 0.130569 Bwd Packet Length Mean 0.079935 Bwd Packet Length Std 0.185775 Flow Bytes/s -0.133443 Flow Packets/s -0.159992 Flow IAT Mean 0.488745 Flow IAT Std 1.000000 Flow IAT Max 0.344938 Flow IAT Min -0.093810 Fwd IAT Total 0.554799 Fwd IAT Mean 0.592206 Fwd IAT Std 0.082205 Fwd IAT Max 0.309412 Fwd IAT Min -0.014954 Bwd IAT Total 0.123286 Bwd IAT Mean -0.064555 Bwd IAT Std 0.127745 Bwd IAT Max 0.167128 Bwd IAT Min 0.168080 Fwd Header Length 0.267877 Bwd Header Length 0.241152 Fwd Packets/s -0.159956 Bwd Packets/s -0.171509 Packet Length Max 0.286885 Packet Length Mean 0.250089 Packet Length Std 0.261937 Packet Length Variance 0.259776 Avg Packet Size 0.238214 Avg Fwd Segment Size 0.350501 Avg Bwd Segment Size 0.079935 Subflow Fwd Packets 0.229385 Subflow Fwd Bytes 0.312305 Subflow Bwd Packets 0.192020 Subflow Bwd Bytes 0.142081 Init Fwd Win Bytes -0.153843 Init Bwd Win Bytes 0.022531 Fwd Act Data Packets 0.130074 Fwd Seg Size Min 0.246742 attack_id 0.012121 Name: Flow IAT Std, dtype: float64 Row 15: Flow Duration 0.530732 Total Fwd Packets 0.198171 Total Backward Packets 0.129965 Fwd Packets Length Total 0.232288 Bwd Packets Length Total 0.117542 Fwd Packet Length Max 0.256882 Fwd Packet Length Mean 0.116476 Fwd Packet Length Std 0.219872 Bwd Packet Length Max 0.291129 Bwd Packet Length Mean 0.132577 Bwd Packet Length Std 0.307785 Flow Bytes/s -0.175313 Flow Packets/s -0.195863 Flow IAT Mean 0.480091 Flow IAT Std 0.344938 Flow IAT Max 1.000000 Flow IAT Min -0.089186 Fwd IAT Total 0.418495 Fwd IAT Mean 0.376253 Fwd IAT Std -0.023027 Fwd IAT Max 0.889802 Fwd IAT Min 0.120750 Bwd IAT Total -0.048240 Bwd IAT Mean 0.500898 Bwd 
IAT Std -0.027571 Bwd IAT Max -0.035833 Bwd IAT Min 0.136607 Fwd Header Length 0.316224 Bwd Header Length 0.231894 Fwd Packets/s -0.196603 Bwd Packets/s -0.220567 Packet Length Max 0.313121 Packet Length Mean 0.168161 Packet Length Std 0.305920 Packet Length Variance 0.227996 Avg Packet Size 0.138955 Avg Fwd Segment Size 0.116476 Avg Bwd Segment Size 0.132577 Subflow Fwd Packets 0.198171 Subflow Fwd Bytes 0.232288 Subflow Bwd Packets 0.129965 Subflow Bwd Bytes 0.117542 Init Fwd Win Bytes -0.107938 Init Bwd Win Bytes -0.014380 Fwd Act Data Packets 0.185398 Fwd Seg Size Min 0.162000 attack_id -0.049590 Name: Flow IAT Max, dtype: float64 Row 16: Flow Duration -0.087898 Total Fwd Packets -0.165506 Total Backward Packets -0.113239 Fwd Packets Length Total -0.146314 Bwd Packets Length Total -0.075916 Fwd Packet Length Max -0.159033 Fwd Packet Length Mean -0.096717 Fwd Packet Length Std -0.162102 Bwd Packet Length Max -0.128903 Bwd Packet Length Mean -0.045374 Bwd Packet Length Std -0.161815 Flow Bytes/s -0.041821 Flow Packets/s -0.060208 Flow IAT Mean -0.092538 Flow IAT Std -0.093810 Flow IAT Max -0.089186 Flow IAT Min 1.000000 Fwd IAT Total -0.071933 Fwd IAT Mean -0.072768 Fwd IAT Std -0.031436 Fwd IAT Max -0.081323 Fwd IAT Min 0.038207 Bwd IAT Total -0.053105 Bwd IAT Mean -0.054524 Bwd IAT Std -0.055326 Bwd IAT Max -0.056652 Bwd IAT Min -0.050071 Fwd Header Length -0.202540 Bwd Header Length -0.169858 Fwd Packets/s -0.060445 Bwd Packets/s -0.024535 Packet Length Max -0.170442 Packet Length Mean -0.099794 Packet Length Std -0.160214 Packet Length Variance -0.152197 Avg Packet Size -0.068930 Avg Fwd Segment Size -0.096717 Avg Bwd Segment Size -0.045374 Subflow Fwd Packets -0.165506 Subflow Fwd Bytes -0.146314 Subflow Bwd Packets -0.113239 Subflow Bwd Bytes -0.075916 Init Fwd Win Bytes -0.073047 Init Bwd Win Bytes -0.040549 Fwd Act Data Packets -0.130984 Fwd Seg Size Min -0.309198 attack_id -0.098165 Name: Flow IAT Min, dtype: float64 Row 17: Flow Duration 0.780641 Total 
Fwd Packets 0.396678 Total Backward Packets 0.272520 Fwd Packets Length Total 0.350136 Bwd Packets Length Total 0.261268 Fwd Packet Length Max 0.383010 Fwd Packet Length Mean 0.321079 Fwd Packet Length Std 0.333408 Bwd Packet Length Max 0.192159 Bwd Packet Length Mean 0.070620 Bwd Packet Length Std 0.189628 Flow Bytes/s -0.142582 Flow Packets/s -0.169590 Flow IAT Mean 0.448291 Flow IAT Std 0.554799 Flow IAT Max 0.418495 Flow IAT Min -0.071933 Fwd IAT Total 1.000000 Fwd IAT Mean 0.642652 Fwd IAT Std 0.319268 Fwd IAT Max 0.460709 Fwd IAT Min 0.004437 Bwd IAT Total 0.133061 Bwd IAT Mean -0.059166 Bwd IAT Std 0.248535 Bwd IAT Max 0.211641 Bwd IAT Min 0.050383 Fwd Header Length 0.375993 Bwd Header Length 0.283238 Fwd Packets/s -0.169231 Bwd Packets/s -0.181755 Packet Length Max 0.270836 Packet Length Mean 0.206450 Packet Length Std 0.215716 Packet Length Variance 0.206249 Avg Packet Size 0.178638 Avg Fwd Segment Size 0.321079 Avg Bwd Segment Size 0.070620 Subflow Fwd Packets 0.396678 Subflow Fwd Bytes 0.350136 Subflow Bwd Packets 0.272520 Subflow Bwd Bytes 0.261268 Init Fwd Win Bytes -0.131350 Init Bwd Win Bytes 0.291423 Fwd Act Data Packets 0.296800 Fwd Seg Size Min 0.204756 attack_id -0.017367 Name: Fwd IAT Total, dtype: float64 Row 18: Flow Duration 0.518293 Total Fwd Packets 0.244505 Total Backward Packets 0.145574 Fwd Packets Length Total 0.293066 Bwd Packets Length Total 0.119108 Fwd Packet Length Max 0.364603 Fwd Packet Length Mean 0.314509 Fwd Packet Length Std 0.330904 Bwd Packet Length Max 0.110592 Bwd Packet Length Mean 0.029596 Bwd Packet Length Std 0.144140 Flow Bytes/s -0.143920 Flow Packets/s -0.160047 Flow IAT Mean 0.623333 Flow IAT Std 0.592206 Flow IAT Max 0.376253 Flow IAT Min -0.072768 Fwd IAT Total 0.642652 Fwd IAT Mean 1.000000 Fwd IAT Std 0.112909 Fwd IAT Max 0.432296 Fwd IAT Min -0.009545 Bwd IAT Total 0.059535 Bwd IAT Mean -0.027191 Bwd IAT Std 0.095992 Bwd IAT Max 0.073118 Bwd IAT Min 0.079706 Fwd Header Length 0.260631 Bwd Header Length 
0.198780 Fwd Packets/s -0.158858 Bwd Packets/s -0.176949 Packet Length Max 0.224164 Packet Length Mean 0.176049 Packet Length Std 0.208747 Packet Length Variance 0.175577 Avg Packet Size 0.158530 Avg Fwd Segment Size 0.314509 Avg Bwd Segment Size 0.029596 Subflow Fwd Packets 0.244505 Subflow Fwd Bytes 0.293066 Subflow Bwd Packets 0.145574 Subflow Bwd Bytes 0.119108 Init Fwd Win Bytes -0.124701 Init Bwd Win Bytes 0.104470 Fwd Act Data Packets 0.180328 Fwd Seg Size Min 0.221853 attack_id -0.019111 Name: Fwd IAT Mean, dtype: float64 Row 19: Flow Duration 0.262062 Total Fwd Packets 0.526483 Total Backward Packets 0.513079 Fwd Packets Length Total 0.413410 Bwd Packets Length Total 0.606764 Fwd Packet Length Max 0.324852 Fwd Packet Length Mean 0.285779 Fwd Packet Length Std 0.229975 Bwd Packet Length Max 0.400185 Bwd Packet Length Mean 0.257838 Bwd Packet Length Std 0.315058 Flow Bytes/s -0.069871 Flow Packets/s -0.110826 Flow IAT Mean 0.024000 Flow IAT Std 0.082205 Flow IAT Max -0.023027 Flow IAT Min -0.031436 Fwd IAT Total 0.319268 Fwd IAT Mean 0.112909 Fwd IAT Std 1.000000 Fwd IAT Max 0.005651 Fwd IAT Min 0.066522 Bwd IAT Total 0.502530 Bwd IAT Mean -0.050013 Bwd IAT Std 0.594384 Bwd IAT Max 0.583120 Bwd IAT Min -0.062677 Fwd Header Length 0.402424 Bwd Header Length 0.395598 Fwd Packets/s -0.110018 Bwd Packets/s -0.098129 Packet Length Max 0.345060 Packet Length Mean 0.272211 Packet Length Std 0.239862 Packet Length Variance 0.250222 Avg Packet Size 0.228855 Avg Fwd Segment Size 0.285779 Avg Bwd Segment Size 0.257838 Subflow Fwd Packets 0.526483 Subflow Fwd Bytes 0.413410 Subflow Bwd Packets 0.513079 Subflow Bwd Bytes 0.606764 Init Fwd Win Bytes -0.037134 Init Bwd Win Bytes 0.699467 Fwd Act Data Packets 0.485163 Fwd Seg Size Min -0.002423 attack_id -0.121112 Name: Fwd IAT Std, dtype: float64 Row 20: Flow Duration 0.359788 Total Fwd Packets 0.225066 Total Backward Packets 0.106457 Fwd Packets Length Total 0.292654 Bwd Packets Length Total 0.145319 Fwd Packet Length Max 
0.323373 Fwd Packet Length Mean 0.178340 Fwd Packet Length Std 0.270654 Bwd Packet Length Max 0.303202 Bwd Packet Length Mean 0.145865 Bwd Packet Length Std 0.304261 Flow Bytes/s -0.155532 Flow Packets/s -0.171152 Flow IAT Mean 0.401846 Flow IAT Std 0.309412 Flow IAT Max 0.889802 Flow IAT Min -0.081323 Fwd IAT Total 0.460709 Fwd IAT Mean 0.432296 Fwd IAT Std 0.005651 Fwd IAT Max 1.000000 Fwd IAT Min 0.101149 Bwd IAT Total -0.045330 Bwd IAT Mean 0.486238 Bwd IAT Std -0.004700 Bwd IAT Max -0.043031 Bwd IAT Min 0.095182 Fwd Header Length 0.352172 Bwd Header Length 0.215404 Fwd Packets/s -0.171523 Bwd Packets/s -0.194523 Packet Length Max 0.340404 Packet Length Mean 0.205559 Packet Length Std 0.315372 Packet Length Variance 0.253050 Avg Packet Size 0.175532 Avg Fwd Segment Size 0.178340 Avg Bwd Segment Size 0.145865 Subflow Fwd Packets 0.225066 Subflow Fwd Bytes 0.292654 Subflow Bwd Packets 0.106457 Subflow Bwd Bytes 0.145319 Init Fwd Win Bytes -0.092624 Init Bwd Win Bytes 0.029922 Fwd Act Data Packets 0.221819 Fwd Seg Size Min 0.178686 attack_id -0.085004 Name: Fwd IAT Max, dtype: float64 Row 21: Flow Duration 0.042262 Total Fwd Packets 0.121484 Total Backward Packets 0.181549 Fwd Packets Length Total 0.101196 Bwd Packets Length Total 0.152866 Fwd Packet Length Max 0.067656 Fwd Packet Length Mean 0.065737 Fwd Packet Length Std 0.093723 Bwd Packet Length Max 0.239714 Bwd Packet Length Mean 0.143203 Bwd Packet Length Std 0.230290 Flow Bytes/s 0.036841 Flow Packets/s 0.025983 Flow IAT Mean 0.032743 Flow IAT Std -0.014954 Flow IAT Max 0.120750 Flow IAT Min 0.038207 Fwd IAT Total 0.004437 Fwd IAT Mean -0.009545 Fwd IAT Std 0.066522 Fwd IAT Max 0.101149 Fwd IAT Min 1.000000 Bwd IAT Total 0.056121 Bwd IAT Mean 0.153464 Bwd IAT Std 0.025600 Bwd IAT Max 0.039186 Bwd IAT Min 0.007819 Fwd Header Length 0.159111 Bwd Header Length 0.217293 Fwd Packets/s 0.014289 Bwd Packets/s -0.013465 Packet Length Max 0.192029 Packet Length Mean 0.118852 Packet Length Std 0.184351 Packet Length 
Variance 0.146953 Avg Packet Size 0.094682 Avg Fwd Segment Size 0.065737 Avg Bwd Segment Size 0.143203 Subflow Fwd Packets 0.121484 Subflow Fwd Bytes 0.101196 Subflow Bwd Packets 0.181549 Subflow Bwd Bytes 0.152866 Init Fwd Win Bytes 0.077115 Init Bwd Win Bytes -0.004198 Fwd Act Data Packets 0.112142 Fwd Seg Size Min 0.057986 attack_id 0.108564 Name: Fwd IAT Min, dtype: float64 Row 22: Flow Duration 0.124080 Total Fwd Packets 0.324204 Total Backward Packets 0.476214 Fwd Packets Length Total 0.259034 Bwd Packets Length Total 0.539414 Fwd Packet Length Max 0.198299 Fwd Packet Length Mean 0.203509 Fwd Packet Length Std 0.160607 Bwd Packet Length Max 0.388287 Bwd Packet Length Mean 0.314509 Bwd Packet Length Std 0.356071 Flow Bytes/s -0.065239 Flow Packets/s -0.117358 Flow IAT Mean 0.011753 Flow IAT Std 0.123286 Flow IAT Max -0.048240 Flow IAT Min -0.053105 Fwd IAT Total 0.133061 Fwd IAT Mean 0.059535 Fwd IAT Std 0.502530 Fwd IAT Max -0.045330 Fwd IAT Min 0.056121 Bwd IAT Total 1.000000 Bwd IAT Mean -0.070098 Bwd IAT Std 0.587980 Bwd IAT Max 0.745204 Bwd IAT Min -0.031971 Fwd Header Length 0.248800 Bwd Header Length 0.367119 Fwd Packets/s -0.119275 Bwd Packets/s -0.090444 Packet Length Max 0.340607 Packet Length Mean 0.296223 Packet Length Std 0.284634 Packet Length Variance 0.320536 Avg Packet Size 0.266127 Avg Fwd Segment Size 0.203509 Avg Bwd Segment Size 0.314509 Subflow Fwd Packets 0.324204 Subflow Fwd Bytes 0.259034 Subflow Bwd Packets 0.476214 Subflow Bwd Bytes 0.539414 Init Fwd Win Bytes -0.045929 Init Bwd Win Bytes 0.470894 Fwd Act Data Packets 0.332644 Fwd Seg Size Min 0.017300 attack_id 0.046394 Name: Bwd IAT Total, dtype: float64 Row 23: Flow Duration 0.020241 Total Fwd Packets 0.192164 Total Backward Packets 0.163481 Fwd Packets Length Total 0.183088 Bwd Packets Length Total 0.119134 Fwd Packet Length Max 0.113974 Fwd Packet Length Mean 0.018704 Fwd Packet Length Std 0.077178 Bwd Packet Length Max 0.308447 Bwd Packet Length Mean 0.216200 Bwd Packet Length 
Std 0.264339 Flow Bytes/s -0.143259 Flow Packets/s -0.147376 Flow IAT Mean 0.117046 Flow IAT Std -0.064555 Flow IAT Max 0.500898 Flow IAT Min -0.054524 Fwd IAT Total -0.059166 Fwd IAT Mean -0.027191 Fwd IAT Std -0.050013 Fwd IAT Max 0.486238 Fwd IAT Min 0.153464 Bwd IAT Total -0.070098 Bwd IAT Mean 1.000000 Bwd IAT Std -0.066464 Bwd IAT Max -0.062794 Bwd IAT Min 0.060752 Fwd Header Length 0.344856 Bwd Header Length 0.306564 Fwd Packets/s -0.147670 Bwd Packets/s -0.171037 Packet Length Max 0.267974 Packet Length Mean 0.169775 Packet Length Std 0.228484 Packet Length Variance 0.204808 Avg Packet Size 0.136195 Avg Fwd Segment Size 0.018704 Avg Bwd Segment Size 0.216200 Subflow Fwd Packets 0.192164 Subflow Fwd Bytes 0.183088 Subflow Bwd Packets 0.163481 Subflow Bwd Bytes 0.119134 Init Fwd Win Bytes -0.069465 Init Bwd Win Bytes -0.046640 Fwd Act Data Packets 0.291875 Fwd Seg Size Min 0.057588 attack_id -0.032041 Name: Bwd IAT Mean, dtype: float64 Row 24: Flow Duration 0.206601 Total Fwd Packets 0.448569 Total Backward Packets 0.512708 Fwd Packets Length Total 0.339885 Bwd Packets Length Total 0.601234 Fwd Packet Length Max 0.289248 Fwd Packet Length Mean 0.242220 Fwd Packet Length Std 0.221190 Bwd Packet Length Max 0.415092 Bwd Packet Length Mean 0.319511 Bwd Packet Length Std 0.361693 Flow Bytes/s -0.055435 Flow Packets/s -0.115618 Flow IAT Mean 0.014825 Flow IAT Std 0.127745 Flow IAT Max -0.027571 Flow IAT Min -0.055326 Fwd IAT Total 0.248535 Fwd IAT Mean 0.095992 Fwd IAT Std 0.594384 Fwd IAT Max -0.004700 Fwd IAT Min 0.025600 Bwd IAT Total 0.587980 Bwd IAT Mean -0.066464 Bwd IAT Std 1.000000 Bwd IAT Max 0.722582 Bwd IAT Min -0.017976 Fwd Header Length 0.360276 Bwd Header Length 0.409684 Fwd Packets/s -0.117894 Bwd Packets/s -0.098808 Packet Length Max 0.382150 Packet Length Mean 0.309635 Packet Length Std 0.302866 Packet Length Variance 0.323665 Avg Packet Size 0.277124 Avg Fwd Segment Size 0.242220 Avg Bwd Segment Size 0.319511 Subflow Fwd Packets 0.448569 Subflow 
Fwd Bytes 0.339885 Subflow Bwd Packets 0.512708 Subflow Bwd Bytes 0.601234 Init Fwd Win Bytes -0.068592 Init Bwd Win Bytes 0.558057 Fwd Act Data Packets 0.408957 Fwd Seg Size Min 0.029104 attack_id 0.013587 Name: Bwd IAT Std, dtype: float64 Row 25: Flow Duration 0.209195 Total Fwd Packets 0.410429 Total Backward Packets 0.521269 Fwd Packets Length Total 0.281791 Bwd Packets Length Total 0.637725 Fwd Packet Length Max 0.215992 Fwd Packet Length Mean 0.166386 Fwd Packet Length Std 0.154433 Bwd Packet Length Max 0.441702 Bwd Packet Length Mean 0.364072 Bwd Packet Length Std 0.404445 Flow Bytes/s -0.083577 Flow Packets/s -0.133601 Flow IAT Mean 0.035911 Flow IAT Std 0.167128 Flow IAT Max -0.035833 Flow IAT Min -0.056652 Fwd IAT Total 0.211641 Fwd IAT Mean 0.073118 Fwd IAT Std 0.583120 Fwd IAT Max -0.043031 Fwd IAT Min 0.039186 Bwd IAT Total 0.745204 Bwd IAT Mean -0.062794 Bwd IAT Std 0.722582 Bwd IAT Max 1.000000 Bwd IAT Min -0.033679 Fwd Header Length 0.321280 Bwd Header Length 0.406998 Fwd Packets/s -0.135456 Bwd Packets/s -0.116905 Packet Length Max 0.390923 Packet Length Mean 0.310833 Packet Length Std 0.318395 Packet Length Variance 0.365282 Avg Packet Size 0.279261 Avg Fwd Segment Size 0.166386 Avg Bwd Segment Size 0.364072 Subflow Fwd Packets 0.410429 Subflow Fwd Bytes 0.281791 Subflow Bwd Packets 0.521269 Subflow Bwd Bytes 0.637725 Init Fwd Win Bytes -0.070402 Init Bwd Win Bytes 0.513751 Fwd Act Data Packets 0.377197 Fwd Seg Size Min 0.011774 attack_id 0.098360 Name: Bwd IAT Max, dtype: float64 Row 26: Flow Duration 0.110107 Total Fwd Packets 0.098966 Total Backward Packets 0.075412 Fwd Packets Length Total 0.159474 Bwd Packets Length Total 0.043402 Fwd Packet Length Max 0.317312 Fwd Packet Length Mean 0.315777 Fwd Packet Length Std 0.378219 Bwd Packet Length Max 0.021272 Bwd Packet Length Mean 0.069716 Bwd Packet Length Std 0.080120 Flow Bytes/s -0.052498 Flow Packets/s -0.069277 Flow IAT Mean 0.095276 Flow IAT Std 0.168080 Flow IAT Max 0.136607 Flow IAT Min 
-0.050071 Fwd IAT Total 0.050383 Fwd IAT Mean 0.079706 Fwd IAT Std -0.062677 Fwd IAT Max 0.095182 Fwd IAT Min 0.007819 Bwd IAT Total -0.031971 Bwd IAT Mean 0.060752 Bwd IAT Std -0.017976 Bwd IAT Max -0.033679 Bwd IAT Min 1.000000 Fwd Header Length 0.161279 Bwd Header Length 0.094064 Fwd Packets/s -0.072123 Bwd Packets/s -0.068265 Packet Length Max 0.165264 Packet Length Mean 0.168417 Packet Length Std 0.189050 Packet Length Variance 0.198223 Avg Packet Size 0.163039 Avg Fwd Segment Size 0.315777 Avg Bwd Segment Size 0.069716 Subflow Fwd Packets 0.098966 Subflow Fwd Bytes 0.159474 Subflow Bwd Packets 0.075412 Subflow Bwd Bytes 0.043402 Init Fwd Win Bytes 0.246999 Init Bwd Win Bytes 0.064453 Fwd Act Data Packets -0.009818 Fwd Seg Size Min 0.107613 attack_id -0.056871 Name: Bwd IAT Min, dtype: float64 Row 27: Flow Duration 0.330307 Total Fwd Packets 0.806061 Total Backward Packets 0.615261 Fwd Packets Length Total 0.570863 Bwd Packets Length Total 0.488606 Fwd Packet Length Max 0.580049 Fwd Packet Length Mean 0.467705 Fwd Packet Length Std 0.530178 Bwd Packet Length Max 0.566809 Bwd Packet Length Mean 0.398443 Bwd Packet Length Std 0.534625 Flow Bytes/s -0.082406 Flow Packets/s -0.153481 Flow IAT Mean 0.175209 Flow IAT Std 0.267877 Flow IAT Max 0.316224 Flow IAT Min -0.202540 Fwd IAT Total 0.375993 Fwd IAT Mean 0.260631 Fwd IAT Std 0.402424 Fwd IAT Max 0.352172 Fwd IAT Min 0.159111 Bwd IAT Total 0.248800 Bwd IAT Mean 0.344856 Bwd IAT Std 0.360276 Bwd IAT Max 0.321280 Bwd IAT Min 0.161279 Fwd Header Length 1.000000 Bwd Header Length 0.724390 Fwd Packets/s -0.155406 Bwd Packets/s -0.148240 Packet Length Max 0.621847 Packet Length Mean 0.500767 Packet Length Std 0.539616 Packet Length Variance 0.500078 Avg Packet Size 0.443692 Avg Fwd Segment Size 0.467705 Avg Bwd Segment Size 0.398443 Subflow Fwd Packets 0.806061 Subflow Fwd Bytes 0.570863 Subflow Bwd Packets 0.615261 Subflow Bwd Bytes 0.488606 Init Fwd Win Bytes 0.013206 Init Bwd Win Bytes 0.424312 Fwd Act Data Packets 
0.667525 Fwd Seg Size Min 0.327892 attack_id -0.021366 Name: Fwd Header Length, dtype: float64 Row 28: Flow Duration 0.294374 Total Fwd Packets 0.711120 Total Backward Packets 0.846017 Fwd Packets Length Total 0.489739 Bwd Packets Length Total 0.559730 Fwd Packet Length Max 0.468355 Fwd Packet Length Mean 0.462312 Fwd Packet Length Std 0.456745 Bwd Packet Length Max 0.638425 Bwd Packet Length Mean 0.530512 Bwd Packet Length Std 0.606914 Flow Bytes/s 0.010898 Flow Packets/s -0.089666 Flow IAT Mean 0.155082 Flow IAT Std 0.241152 Flow IAT Max 0.231894 Flow IAT Min -0.169858 Fwd IAT Total 0.283238 Fwd IAT Mean 0.198780 Fwd IAT Std 0.395598 Fwd IAT Max 0.215404 Fwd IAT Min 0.217293 Bwd IAT Total 0.367119 Bwd IAT Mean 0.306564 Bwd IAT Std 0.409684 Bwd IAT Max 0.406998 Bwd IAT Min 0.094064 Fwd Header Length 0.724390 Bwd Header Length 1.000000 Fwd Packets/s -0.108921 Bwd Packets/s -0.065203 Packet Length Max 0.630535 Packet Length Mean 0.579675 Packet Length Std 0.589688 Packet Length Variance 0.504483 Avg Packet Size 0.538469 Avg Fwd Segment Size 0.462312 Avg Bwd Segment Size 0.530512 Subflow Fwd Packets 0.711120 Subflow Fwd Bytes 0.489739 Subflow Bwd Packets 0.846017 Subflow Bwd Bytes 0.559730 Init Fwd Win Bytes -0.039221 Init Bwd Win Bytes 0.380560 Fwd Act Data Packets 0.616584 Fwd Seg Size Min 0.264664 attack_id -0.029029 Name: Bwd Header Length, dtype: float64 Row 29: Flow Duration -0.201649 Total Fwd Packets -0.141910 Total Backward Packets -0.068510 Fwd Packets Length Total -0.072082 Bwd Packets Length Total -0.064800 Fwd Packet Length Max -0.024095 Fwd Packet Length Mean 0.065484 Fwd Packet Length Std 0.035362 Bwd Packet Length Max -0.079164 Bwd Packet Length Mean 0.011930 Bwd Packet Length Std -0.060242 Flow Bytes/s 0.382884 Flow Packets/s 0.835874 Flow IAT Mean -0.193202 Flow IAT Std -0.159956 Flow IAT Max -0.196603 Flow IAT Min -0.060445 Fwd IAT Total -0.169231 Fwd IAT Mean -0.158858 Fwd IAT Std -0.110018 Fwd IAT Max -0.171523 Fwd IAT Min 0.014289 Bwd IAT Total 
-0.119275 Bwd IAT Mean -0.147670 Bwd IAT Std -0.117894 Bwd IAT Max -0.135456 Bwd IAT Min -0.072123 Fwd Header Length -0.155406 Bwd Header Length -0.108921 Fwd Packets/s 1.000000 Bwd Packets/s 0.276758 Packet Length Max -0.059588 Packet Length Mean 0.004687 Packet Length Std -0.027922 Packet Length Variance -0.020036 Avg Packet Size 0.029423 Avg Fwd Segment Size 0.065484 Avg Bwd Segment Size 0.011930 Subflow Fwd Packets -0.141910 Subflow Fwd Bytes -0.072082 Subflow Bwd Packets -0.068510 Subflow Bwd Bytes -0.064800 Init Fwd Win Bytes 0.188806 Init Bwd Win Bytes -0.073528 Fwd Act Data Packets -0.143159 Fwd Seg Size Min -0.127855 attack_id 0.037160 Name: Fwd Packets/s, dtype: float64 Row 30: Flow Duration -0.215691 Total Fwd Packets -0.126428 Total Backward Packets -0.013203 Fwd Packets Length Total -0.022332 Bwd Packets Length Total -0.046593 Fwd Packet Length Max -0.034213 Fwd Packet Length Mean 0.053065 Fwd Packet Length Std -0.014289 Bwd Packet Length Max -0.015688 Bwd Packet Length Mean 0.105039 Bwd Packet Length Std -0.021202 Flow Bytes/s 0.383061 Flow Packets/s 0.258343 Flow IAT Mean -0.213609 Flow IAT Std -0.171509 Flow IAT Max -0.220567 Flow IAT Min -0.024535 Fwd IAT Total -0.181755 Fwd IAT Mean -0.176949 Fwd IAT Std -0.098129 Fwd IAT Max -0.194523 Fwd IAT Min -0.013465 Bwd IAT Total -0.090444 Bwd IAT Mean -0.171037 Bwd IAT Std -0.098808 Bwd IAT Max -0.116905 Bwd IAT Min -0.068265 Fwd Header Length -0.148240 Bwd Header Length -0.065203 Fwd Packets/s 0.276758 Bwd Packets/s 1.000000 Packet Length Max -0.021899 Packet Length Mean 0.088190 Packet Length Std 0.009359 Packet Length Variance -0.024543 Avg Packet Size 0.116140 Avg Fwd Segment Size 0.053065 Avg Bwd Segment Size 0.105039 Subflow Fwd Packets -0.126428 Subflow Fwd Bytes -0.022332 Subflow Bwd Packets -0.013203 Subflow Bwd Bytes -0.046593 Init Fwd Win Bytes 0.130409 Init Bwd Win Bytes -0.069479 Fwd Act Data Packets -0.095185 Fwd Seg Size Min -0.072828 attack_id -0.080766 Name: Bwd Packets/s, dtype: float64 
Row 31: Flow Duration 0.244895 Total Fwd Packets 0.591023 Total Backward Packets 0.596776 Fwd Packets Length Total 0.680678 Bwd Packets Length Total 0.601459 Fwd Packet Length Max 0.693306 Fwd Packet Length Mean 0.611326 Fwd Packet Length Std 0.634520 Bwd Packet Length Max 0.916820 Bwd Packet Length Mean 0.702215 Bwd Packet Length Std 0.866421 Flow Bytes/s 0.031106 Flow Packets/s -0.048260 Flow IAT Mean 0.185374 Flow IAT Std 0.286885 Flow IAT Max 0.313121 Flow IAT Min -0.170442 Fwd IAT Total 0.270836 Fwd IAT Mean 0.224164 Fwd IAT Std 0.345060 Fwd IAT Max 0.340404 Fwd IAT Min 0.192029 Bwd IAT Total 0.340607 Bwd IAT Mean 0.267974 Bwd IAT Std 0.382150 Bwd IAT Max 0.390923 Bwd IAT Min 0.165264 Fwd Header Length 0.621847 Bwd Header Length 0.630535 Fwd Packets/s -0.059588 Bwd Packets/s -0.021899 Packet Length Max 1.000000 Packet Length Mean 0.817534 Packet Length Std 0.914118 Packet Length Variance 0.770931 Avg Packet Size 0.778213 Avg Fwd Segment Size 0.611326 Avg Bwd Segment Size 0.702215 Subflow Fwd Packets 0.591023 Subflow Fwd Bytes 0.680678 Subflow Bwd Packets 0.596776 Subflow Bwd Bytes 0.601459 Init Fwd Win Bytes 0.171932 Init Bwd Win Bytes 0.393128 Fwd Act Data Packets 0.558498 Fwd Seg Size Min 0.154635 attack_id -0.061398 Name: Packet Length Max, dtype: float64 Row 32: Flow Duration 0.160033 Total Fwd Packets 0.499634 Total Backward Packets 0.570523 Fwd Packets Length Total 0.577650 Bwd Packets Length Total 0.530830 Fwd Packet Length Max 0.624118 Fwd Packet Length Mean 0.666252 Fwd Packet Length Std 0.595388 Bwd Packet Length Max 0.736857 Bwd Packet Length Mean 0.808163 Bwd Packet Length Std 0.730641 Flow Bytes/s 0.093525 Flow Packets/s 0.015124 Flow IAT Mean 0.113642 Flow IAT Std 0.250089 Flow IAT Max 0.168161 Flow IAT Min -0.099794 Fwd IAT Total 0.206450 Fwd IAT Mean 0.176049 Fwd IAT Std 0.272211 Fwd IAT Max 0.205559 Fwd IAT Min 0.118852 Bwd IAT Total 0.296223 Bwd IAT Mean 0.169775 Bwd IAT Std 0.309635 Bwd IAT Max 0.310833 Bwd IAT Min 0.168417 Fwd Header Length 
0.500767 Bwd Header Length 0.579675 Fwd Packets/s 0.004687 Bwd Packets/s 0.088190 Packet Length Max 0.817534 Packet Length Mean 1.000000 Packet Length Std 0.778409 Packet Length Variance 0.731589 Avg Packet Size 0.965009 Avg Fwd Segment Size 0.666252 Avg Bwd Segment Size 0.808163 Subflow Fwd Packets 0.499634 Subflow Fwd Bytes 0.577650 Subflow Bwd Packets 0.570523 Subflow Bwd Bytes 0.530830 Init Fwd Win Bytes 0.183537 Init Bwd Win Bytes 0.320431 Fwd Act Data Packets 0.481511 Fwd Seg Size Min 0.099643 attack_id -0.098992 Name: Packet Length Mean, dtype: float64 Row 33: Flow Duration 0.211278 Total Fwd Packets 0.500338 Total Backward Packets 0.552656 Fwd Packets Length Total 0.554814 Bwd Packets Length Total 0.485171 Fwd Packet Length Max 0.605821 Fwd Packet Length Mean 0.567894 Fwd Packet Length Std 0.610741 Bwd Packet Length Max 0.850145 Bwd Packet Length Mean 0.682757 Bwd Packet Length Std 0.886948 Flow Bytes/s 0.062113 Flow Packets/s -0.015276 Flow IAT Mean 0.190332 Flow IAT Std 0.261937 Flow IAT Max 0.305920 Flow IAT Min -0.160214 Fwd IAT Total 0.215716 Fwd IAT Mean 0.208747 Fwd IAT Std 0.239862 Fwd IAT Max 0.315372 Fwd IAT Min 0.184351 Bwd IAT Total 0.284634 Bwd IAT Mean 0.228484 Bwd IAT Std 0.302866 Bwd IAT Max 0.318395 Bwd IAT Min 0.189050 Fwd Header Length 0.539616 Bwd Header Length 0.589688 Fwd Packets/s -0.027922 Bwd Packets/s 0.009359 Packet Length Max 0.914118 Packet Length Mean 0.778409 Packet Length Std 1.000000 Packet Length Variance 0.693396 Avg Packet Size 0.764796 Avg Fwd Segment Size 0.567894 Avg Bwd Segment Size 0.682757 Subflow Fwd Packets 0.500338 Subflow Fwd Bytes 0.554814 Subflow Bwd Packets 0.552656 Subflow Bwd Bytes 0.485171 Init Fwd Win Bytes 0.201096 Init Bwd Win Bytes 0.287666 Fwd Act Data Packets 0.470960 Fwd Seg Size Min 0.152382 attack_id -0.097046 Name: Packet Length Std, dtype: float64 Row 34: Flow Duration 0.183152 Total Fwd Packets 0.456418 Total Backward Packets 0.457982 Fwd Packets Length Total 0.514811 Bwd Packets Length Total 
0.521250 Fwd Packet Length Max 0.610889 Fwd Packet Length Mean 0.576243 Fwd Packet Length Std 0.627071 Bwd Packet Length Max 0.678489 Bwd Packet Length Mean 0.724085 Bwd Packet Length Std 0.685478 Flow Bytes/s -0.019063 Flow Packets/s -0.014598 Flow IAT Mean 0.115945 Flow IAT Std 0.259776 Flow IAT Max 0.227996 Flow IAT Min -0.152197 Fwd IAT Total 0.206249 Fwd IAT Mean 0.175577 Fwd IAT Std 0.250222 Fwd IAT Max 0.253050 Fwd IAT Min 0.146953 Bwd IAT Total 0.320536 Bwd IAT Mean 0.204808 Bwd IAT Std 0.323665 Bwd IAT Max 0.365282 Bwd IAT Min 0.198223 Fwd Header Length 0.500078 Bwd Header Length 0.504483 Fwd Packets/s -0.020036 Bwd Packets/s -0.024543 Packet Length Max 0.770931 Packet Length Mean 0.731589 Packet Length Std 0.693396 Packet Length Variance 1.000000 Avg Packet Size 0.678704 Avg Fwd Segment Size 0.576243 Avg Bwd Segment Size 0.724085 Subflow Fwd Packets 0.456418 Subflow Fwd Bytes 0.514811 Subflow Bwd Packets 0.457982 Subflow Bwd Bytes 0.521250 Init Fwd Win Bytes 0.236211 Init Bwd Win Bytes 0.344496 Fwd Act Data Packets 0.378085 Fwd Seg Size Min 0.172997 attack_id -0.023701 Name: Packet Length Variance, dtype: float64 Row 35: Flow Duration 0.131517 Total Fwd Packets 0.444920 Total Backward Packets 0.537342 Fwd Packets Length Total 0.535458 Bwd Packets Length Total 0.484636 Fwd Packet Length Max 0.592635 Fwd Packet Length Mean 0.656824 Fwd Packet Length Std 0.563593 Bwd Packet Length Max 0.695708 Bwd Packet Length Mean 0.801836 Bwd Packet Length Std 0.701522 Flow Bytes/s 0.107747 Flow Packets/s 0.038576 Flow IAT Mean 0.099798 Flow IAT Std 0.238214 Flow IAT Max 0.138955 Flow IAT Min -0.068930 Fwd IAT Total 0.178638 Fwd IAT Mean 0.158530 Fwd IAT Std 0.228855 Fwd IAT Max 0.175532 Fwd IAT Min 0.094682 Bwd IAT Total 0.266127 Bwd IAT Mean 0.136195 Bwd IAT Std 0.277124 Bwd IAT Max 0.279261 Bwd IAT Min 0.163039 Fwd Header Length 0.443692 Bwd Header Length 0.538469 Fwd Packets/s 0.029423 Bwd Packets/s 0.116140 Packet Length Max 0.778213 Packet Length Mean 0.965009 
Packet Length Std 0.764796 Packet Length Variance 0.678704 Avg Packet Size 1.000000 Avg Fwd Segment Size 0.656824 Avg Bwd Segment Size 0.801836 Subflow Fwd Packets 0.444920 Subflow Fwd Bytes 0.535458 Subflow Bwd Packets 0.537342 Subflow Bwd Bytes 0.484636 Init Fwd Win Bytes 0.178445 Init Bwd Win Bytes 0.274867 Fwd Act Data Packets 0.434184 Fwd Seg Size Min 0.058608 attack_id -0.103647 Name: Avg Packet Size, dtype: float64 Row 36: Flow Duration 0.223883 Total Fwd Packets 0.443021 Total Backward Packets 0.454429 Fwd Packets Length Total 0.673515 Bwd Packets Length Total 0.433777 Fwd Packet Length Max 0.876929 Fwd Packet Length Mean 1.000000 Fwd Packet Length Std 0.906266 Bwd Packet Length Max 0.389350 Bwd Packet Length Mean 0.430169 Bwd Packet Length Std 0.406844 Flow Bytes/s 0.088291 Flow Packets/s 0.079698 Flow IAT Mean 0.132827 Flow IAT Std 0.350501 Flow IAT Max 0.116476 Flow IAT Min -0.096717 Fwd IAT Total 0.321079 Fwd IAT Mean 0.314509 Fwd IAT Std 0.285779 Fwd IAT Max 0.178340 Fwd IAT Min 0.065737 Bwd IAT Total 0.203509 Bwd IAT Mean 0.018704 Bwd IAT Std 0.242220 Bwd IAT Max 0.166386 Bwd IAT Min 0.315777 Fwd Header Length 0.467705 Bwd Header Length 0.462312 Fwd Packets/s 0.065484 Bwd Packets/s 0.053065 Packet Length Max 0.611326 Packet Length Mean 0.666252 Packet Length Std 0.567894 Packet Length Variance 0.576243 Avg Packet Size 0.656824 Avg Fwd Segment Size 1.000000 Avg Bwd Segment Size 0.430169 Subflow Fwd Packets 0.443021 Subflow Fwd Bytes 0.673515 Subflow Bwd Packets 0.454429 Subflow Bwd Bytes 0.433777 Init Fwd Win Bytes 0.261404 Init Bwd Win Bytes 0.415983 Fwd Act Data Packets 0.323995 Fwd Seg Size Min 0.226251 attack_id -0.196100 Name: Avg Fwd Segment Size, dtype: float64 Row 37: Flow Duration 0.069250 Total Fwd Packets 0.389019 Total Backward Packets 0.530979 Fwd Packets Length Total 0.397051 Bwd Packets Length Total 0.556433 Fwd Packet Length Max 0.387529 Fwd Packet Length Mean 0.430169 Fwd Packet Length Std 0.376194 Bwd Packet Length Max 0.745937 Bwd 
Packet Length Mean 1.000000 Bwd Packet Length Std 0.713763 Flow Bytes/s 0.059257 Flow Packets/s 0.017235 Flow IAT Mean 0.037822 Flow IAT Std 0.079935 Flow IAT Max 0.132577 Flow IAT Min -0.045374 Fwd IAT Total 0.070620 Fwd IAT Mean 0.029596 Fwd IAT Std 0.257838 Fwd IAT Max 0.145865 Fwd IAT Min 0.143203 Bwd IAT Total 0.314509 Bwd IAT Mean 0.216200 Bwd IAT Std 0.319511 Bwd IAT Max 0.364072 Bwd IAT Min 0.069716 Fwd Header Length 0.398443 Bwd Header Length 0.530512 Fwd Packets/s 0.011930 Bwd Packets/s 0.105039 Packet Length Max 0.702215 Packet Length Mean 0.808163 Packet Length Std 0.682757 Packet Length Variance 0.724085 Avg Packet Size 0.801836 Avg Fwd Segment Size 0.430169 Avg Bwd Segment Size 1.000000 Subflow Fwd Packets 0.389019 Subflow Fwd Bytes 0.397051 Subflow Bwd Packets 0.530979 Subflow Bwd Bytes 0.556433 Init Fwd Win Bytes 0.125144 Init Bwd Win Bytes 0.277818 Fwd Act Data Packets 0.415084 Fwd Seg Size Min 0.017201 attack_id -0.085765 Name: Avg Bwd Segment Size, dtype: float64 Row 38: Flow Duration 0.344572 Total Fwd Packets 1.000000 Total Backward Packets 0.693930 Fwd Packets Length Total 0.580110 Bwd Packets Length Total 0.536628 Fwd Packet Length Max 0.535619 Fwd Packet Length Mean 0.443021 Fwd Packet Length Std 0.460252 Bwd Packet Length Max 0.559189 Bwd Packet Length Mean 0.389019 Bwd Packet Length Std 0.526333 Flow Bytes/s -0.056336 Flow Packets/s -0.142290 Flow IAT Mean 0.155158 Flow IAT Std 0.229385 Flow IAT Max 0.198171 Flow IAT Min -0.165506 Fwd IAT Total 0.396678 Fwd IAT Mean 0.244505 Fwd IAT Std 0.526483 Fwd IAT Max 0.225066 Fwd IAT Min 0.121484 Bwd IAT Total 0.324204 Bwd IAT Mean 0.192164 Bwd IAT Std 0.448569 Bwd IAT Max 0.410429 Bwd IAT Min 0.098966 Fwd Header Length 0.806061 Bwd Header Length 0.711120 Fwd Packets/s -0.141910 Bwd Packets/s -0.126428 Packet Length Max 0.591023 Packet Length Mean 0.499634 Packet Length Std 0.500338 Packet Length Variance 0.456418 Avg Packet Size 0.444920 Avg Fwd Segment Size 0.443021 Avg Bwd Segment Size 0.389019 
Subflow Fwd Packets 1.000000 Subflow Fwd Bytes 0.580110 Subflow Bwd Packets 0.693930 Subflow Bwd Bytes 0.536628 Init Fwd Win Bytes 0.028631 Init Bwd Win Bytes 0.541543 Fwd Act Data Packets 0.755914 Fwd Seg Size Min 0.151001 attack_id -0.039465 Name: Subflow Fwd Packets, dtype: float64 Row 39: Flow Duration 0.266345 Total Fwd Packets 0.580110 Total Backward Packets 0.457674 Fwd Packets Length Total 1.000000 Bwd Packets Length Total 0.470164 Fwd Packet Length Max 0.796363 Fwd Packet Length Mean 0.673515 Fwd Packet Length Std 0.656346 Bwd Packet Length Max 0.547430 Bwd Packet Length Mean 0.397051 Bwd Packet Length Std 0.512406 Flow Bytes/s -0.001175 Flow Packets/s -0.068197 Flow IAT Mean 0.200397 Flow IAT Std 0.312305 Flow IAT Max 0.232288 Flow IAT Min -0.146314 Fwd IAT Total 0.350136 Fwd IAT Mean 0.293066 Fwd IAT Std 0.413410 Fwd IAT Max 0.292654 Fwd IAT Min 0.101196 Bwd IAT Total 0.259034 Bwd IAT Mean 0.183088 Bwd IAT Std 0.339885 Bwd IAT Max 0.281791 Bwd IAT Min 0.159474 Fwd Header Length 0.570863 Bwd Header Length 0.489739 Fwd Packets/s -0.072082 Bwd Packets/s -0.022332 Packet Length Max 0.680678 Packet Length Mean 0.577650 Packet Length Std 0.554814 Packet Length Variance 0.514811 Avg Packet Size 0.535458 Avg Fwd Segment Size 0.673515 Avg Bwd Segment Size 0.397051 Subflow Fwd Packets 0.580110 Subflow Fwd Bytes 1.000000 Subflow Bwd Packets 0.457674 Subflow Bwd Bytes 0.470164 Init Fwd Win Bytes 0.148935 Init Bwd Win Bytes 0.477821 Fwd Act Data Packets 0.559723 Fwd Seg Size Min 0.200562 attack_id -0.085335 Name: Subflow Fwd Bytes, dtype: float64 Row 40: Flow Duration 0.284508 Total Fwd Packets 0.693930 Total Backward Packets 1.000000 Fwd Packets Length Total 0.457674 Bwd Packets Length Total 0.667482 Fwd Packet Length Max 0.431439 Fwd Packet Length Mean 0.454429 Fwd Packet Length Std 0.418142 Bwd Packet Length Max 0.626985 Bwd Packet Length Mean 0.530979 Bwd Packet Length Std 0.589303 Flow Bytes/s 0.053830 Flow Packets/s -0.046278 Flow IAT Mean 0.098510 Flow IAT Std 
0.192020 Flow IAT Max 0.129965 Flow IAT Min -0.113239 Fwd IAT Total 0.272520 Fwd IAT Mean 0.145574 Fwd IAT Std 0.513079 Fwd IAT Max 0.106457 Fwd IAT Min 0.181549 Bwd IAT Total 0.476214 Bwd IAT Mean 0.163481 Bwd IAT Std 0.512708 Bwd IAT Max 0.521269 Bwd IAT Min 0.075412 Fwd Header Length 0.615261 Bwd Header Length 0.846017 Fwd Packets/s -0.068510 Bwd Packets/s -0.013203 Packet Length Max 0.596776 Packet Length Mean 0.570523 Packet Length Std 0.552656 Packet Length Variance 0.457982 Avg Packet Size 0.537342 Avg Fwd Segment Size 0.454429 Avg Bwd Segment Size 0.530979 Subflow Fwd Packets 0.693930 Subflow Fwd Bytes 0.457674 Subflow Bwd Packets 1.000000 Subflow Bwd Bytes 0.667482 Init Fwd Win Bytes -0.018963 Init Bwd Win Bytes 0.501559 Fwd Act Data Packets 0.602988 Fwd Seg Size Min 0.124126 attack_id -0.064490 Name: Subflow Bwd Packets, dtype: float64 Row 41: Flow Duration 0.227557 Total Fwd Packets 0.536628 Total Backward Packets 0.667482 Fwd Packets Length Total 0.470164 Bwd Packets Length Total 1.000000 Fwd Packet Length Max 0.451904 Fwd Packet Length Mean 0.433777 Fwd Packet Length Std 0.401362 Bwd Packet Length Max 0.642817 Bwd Packet Length Mean 0.556433 Bwd Packet Length Std 0.560662 Flow Bytes/s -0.043166 Flow Packets/s -0.054143 Flow IAT Mean 0.038200 Flow IAT Std 0.142081 Flow IAT Max 0.117542 Flow IAT Min -0.075916 Fwd IAT Total 0.261268 Fwd IAT Mean 0.119108 Fwd IAT Std 0.606764 Fwd IAT Max 0.145319 Fwd IAT Min 0.152866 Bwd IAT Total 0.539414 Bwd IAT Mean 0.119134 Bwd IAT Std 0.601234 Bwd IAT Max 0.637725 Bwd IAT Min 0.043402 Fwd Header Length 0.488606 Bwd Header Length 0.559730 Fwd Packets/s -0.064800 Bwd Packets/s -0.046593 Packet Length Max 0.601459 Packet Length Mean 0.530830 Packet Length Std 0.485171 Packet Length Variance 0.521250 Avg Packet Size 0.484636 Avg Fwd Segment Size 0.433777 Avg Bwd Segment Size 0.556433 Subflow Fwd Packets 0.536628 Subflow Fwd Bytes 0.470164 Subflow Bwd Packets 0.667482 Subflow Bwd Bytes 1.000000 Init Fwd Win Bytes 0.058459 
Init Bwd Win Bytes 0.626598 Fwd Act Data Packets 0.495346 Fwd Seg Size Min 0.053893 attack_id -0.035668 Name: Subflow Bwd Bytes, dtype: float64 Row 42: Flow Duration -0.136505 Total Fwd Packets 0.028631 Total Backward Packets -0.018963 Fwd Packets Length Total 0.148935 Bwd Packets Length Total 0.058459 Fwd Packet Length Max 0.250035 Fwd Packet Length Mean 0.261404 Fwd Packet Length Std 0.313570 Bwd Packet Length Max 0.082989 Bwd Packet Length Mean 0.125144 Bwd Packet Length Std 0.169575 Flow Bytes/s 0.062303 Flow Packets/s 0.192189 Flow IAT Mean -0.113184 Flow IAT Std -0.153843 Flow IAT Max -0.107938 Flow IAT Min -0.073047 Fwd IAT Total -0.131350 Fwd IAT Mean -0.124701 Fwd IAT Std -0.037134 Fwd IAT Max -0.092624 Fwd IAT Min 0.077115 Bwd IAT Total -0.045929 Bwd IAT Mean -0.069465 Bwd IAT Std -0.068592 Bwd IAT Max -0.070402 Bwd IAT Min 0.246999 Fwd Header Length 0.013206 Bwd Header Length -0.039221 Fwd Packets/s 0.188806 Bwd Packets/s 0.130409 Packet Length Max 0.171932 Packet Length Mean 0.183537 Packet Length Std 0.201096 Packet Length Variance 0.236211 Avg Packet Size 0.178445 Avg Fwd Segment Size 0.261404 Avg Bwd Segment Size 0.125144 Subflow Fwd Packets 0.028631 Subflow Fwd Bytes 0.148935 Subflow Bwd Packets -0.018963 Subflow Bwd Bytes 0.058459 Init Fwd Win Bytes 1.000000 Init Bwd Win Bytes 0.152317 Fwd Act Data Packets -0.036225 Fwd Seg Size Min 0.032016 attack_id -0.024570 Name: Init Fwd Win Bytes, dtype: float64 Row 43: Flow Duration 0.220855 Total Fwd Packets 0.541543 Total Backward Packets 0.501559 Fwd Packets Length Total 0.477821 Bwd Packets Length Total 0.626598 Fwd Packet Length Max 0.463152 Fwd Packet Length Mean 0.415983 Fwd Packet Length Std 0.390927 Bwd Packet Length Max 0.383273 Bwd Packet Length Mean 0.277818 Bwd Packet Length Std 0.315921 Flow Bytes/s -0.084479 Flow Packets/s -0.079840 Flow IAT Mean 0.024429 Flow IAT Std 0.022531 Flow IAT Max -0.014380 Flow IAT Min -0.040549 Fwd IAT Total 0.291423 Fwd IAT Mean 0.104470 Fwd IAT Std 0.699467 Fwd 
IAT Max 0.029922 Fwd IAT Min -0.004198 Bwd IAT Total 0.470894 Bwd IAT Mean -0.046640 Bwd IAT Std 0.558057 Bwd IAT Max 0.513751 Bwd IAT Min 0.064453 Fwd Header Length 0.424312 Bwd Header Length 0.380560 Fwd Packets/s -0.073528 Bwd Packets/s -0.069479 Packet Length Max 0.393128 Packet Length Mean 0.320431 Packet Length Std 0.287666 Packet Length Variance 0.344496 Avg Packet Size 0.274867 Avg Fwd Segment Size 0.415983 Avg Bwd Segment Size 0.277818 Subflow Fwd Packets 0.541543 Subflow Fwd Bytes 0.477821 Subflow Bwd Packets 0.501559 Subflow Bwd Bytes 0.626598 Init Fwd Win Bytes 0.152317 Init Bwd Win Bytes 1.000000 Fwd Act Data Packets 0.467861 Fwd Seg Size Min -0.034003 attack_id -0.188538 Name: Init Bwd Win Bytes, dtype: float64 Row 44: Flow Duration 0.245885 Total Fwd Packets 0.755914 Total Backward Packets 0.602988 Fwd Packets Length Total 0.559723 Bwd Packets Length Total 0.495346 Fwd Packet Length Max 0.397308 Fwd Packet Length Mean 0.323995 Fwd Packet Length Std 0.282448 Bwd Packet Length Max 0.585226 Bwd Packet Length Mean 0.415084 Bwd Packet Length Std 0.529967 Flow Bytes/s -0.040516 Flow Packets/s -0.139593 Flow IAT Mean 0.145694 Flow IAT Std 0.130074 Flow IAT Max 0.185398 Flow IAT Min -0.130984 Fwd IAT Total 0.296800 Fwd IAT Mean 0.180328 Fwd IAT Std 0.485163 Fwd IAT Max 0.221819 Fwd IAT Min 0.112142 Bwd IAT Total 0.332644 Bwd IAT Mean 0.291875 Bwd IAT Std 0.408957 Bwd IAT Max 0.377197 Bwd IAT Min -0.009818 Fwd Header Length 0.667525 Bwd Header Length 0.616584 Fwd Packets/s -0.143159 Bwd Packets/s -0.095185 Packet Length Max 0.558498 Packet Length Mean 0.481511 Packet Length Std 0.470960 Packet Length Variance 0.378085 Avg Packet Size 0.434184 Avg Fwd Segment Size 0.323995 Avg Bwd Segment Size 0.415084 Subflow Fwd Packets 0.755914 Subflow Fwd Bytes 0.559723 Subflow Bwd Packets 0.602988 Subflow Bwd Bytes 0.495346 Init Fwd Win Bytes -0.036225 Init Bwd Win Bytes 0.467861 Fwd Act Data Packets 1.000000 Fwd Seg Size Min 0.049872 attack_id -0.086071 Name: Fwd Act 
Data Packets, dtype: float64 Row 45: Flow Duration 0.172877 Total Fwd Packets 0.151001 Total Backward Packets 0.124126 Fwd Packets Length Total 0.200562 Bwd Packets Length Total 0.053893 Fwd Packet Length Max 0.267014 Fwd Packet Length Mean 0.226251 Fwd Packet Length Std 0.302648 Bwd Packet Length Max 0.082755 Bwd Packet Length Mean 0.017201 Bwd Packet Length Std 0.127629 Flow Bytes/s -0.078757 Flow Packets/s -0.120632 Flow IAT Mean 0.148807 Flow IAT Std 0.246742 Flow IAT Max 0.162000 Flow IAT Min -0.309198 Fwd IAT Total 0.204756 Fwd IAT Mean 0.221853 Fwd IAT Std -0.002423 Fwd IAT Max 0.178686 Fwd IAT Min 0.057986 Bwd IAT Total 0.017300 Bwd IAT Mean 0.057588 Bwd IAT Std 0.029104 Bwd IAT Max 0.011774 Bwd IAT Min 0.107613 Fwd Header Length 0.327892 Bwd Header Length 0.264664 Fwd Packets/s -0.127855 Bwd Packets/s -0.072828 Packet Length Max 0.154635 Packet Length Mean 0.099643 Packet Length Std 0.152382 Packet Length Variance 0.172997 Avg Packet Size 0.058608 Avg Fwd Segment Size 0.226251 Avg Bwd Segment Size 0.017201 Subflow Fwd Packets 0.151001 Subflow Fwd Bytes 0.200562 Subflow Bwd Packets 0.124126 Subflow Bwd Bytes 0.053893 Init Fwd Win Bytes 0.032016 Init Bwd Win Bytes -0.034003 Fwd Act Data Packets 0.049872 Fwd Seg Size Min 1.000000 attack_id 0.109920 Name: Fwd Seg Size Min, dtype: float64 Row 46: Flow Duration 0.026159 Total Fwd Packets -0.039465 Total Backward Packets -0.064490 Fwd Packets Length Total -0.085335 Bwd Packets Length Total -0.035668 Fwd Packet Length Max -0.114476 Fwd Packet Length Mean -0.196100 Fwd Packet Length Std -0.141036 Bwd Packet Length Max -0.064293 Bwd Packet Length Mean -0.085765 Bwd Packet Length Std -0.039774 Flow Bytes/s -0.014417 Flow Packets/s 0.040890 Flow IAT Mean 0.016617 Flow IAT Std 0.012121 Flow IAT Max -0.049590 Flow IAT Min -0.098165 Fwd IAT Total -0.017367 Fwd IAT Mean -0.019111 Fwd IAT Std -0.121112 Fwd IAT Max -0.085004 Fwd IAT Min 0.108564 Bwd IAT Total 0.046394 Bwd IAT Mean -0.032041 Bwd IAT Std 0.013587 Bwd IAT Max 
0.098360 Bwd IAT Min -0.056871 Fwd Header Length -0.021366 Bwd Header Length -0.029029 Fwd Packets/s 0.037160 Bwd Packets/s -0.080766 Packet Length Max -0.061398 Packet Length Mean -0.098992 Packet Length Std -0.097046 Packet Length Variance -0.023701 Avg Packet Size -0.103647 Avg Fwd Segment Size -0.196100 Avg Bwd Segment Size -0.085765 Subflow Fwd Packets -0.039465 Subflow Fwd Bytes -0.085335 Subflow Bwd Packets -0.064490 Subflow Bwd Bytes -0.035668 Init Fwd Win Bytes -0.024570 Init Bwd Win Bytes -0.188538 Fwd Act Data Packets -0.086071 Fwd Seg Size Min 0.109920 attack_id 1.000000 Name: attack_id, dtype: float64
# Normalise the sampled dataset's column names: every space becomes an underscore (_).
sampled_cic_df.columns = ['_'.join(name.split(' ')) for name in sampled_cic_df.columns]
sampled_cic_df.columns
Index(['Flow_Duration', 'Total_Fwd_Packets', 'Total_Backward_Packets',
'Fwd_Packets_Length_Total', 'Bwd_Packets_Length_Total',
'Fwd_Packet_Length_Max', 'Fwd_Packet_Length_Mean',
'Fwd_Packet_Length_Std', 'Bwd_Packet_Length_Max',
'Bwd_Packet_Length_Mean', 'Bwd_Packet_Length_Std', 'Flow_Bytes/s',
'Flow_Packets/s', 'Flow_IAT_Mean', 'Flow_IAT_Std', 'Flow_IAT_Max',
'Flow_IAT_Min', 'Fwd_IAT_Total', 'Fwd_IAT_Mean', 'Fwd_IAT_Std',
'Fwd_IAT_Max', 'Fwd_IAT_Min', 'Bwd_IAT_Total', 'Bwd_IAT_Mean',
'Bwd_IAT_Std', 'Bwd_IAT_Max', 'Bwd_IAT_Min', 'Fwd_Header_Length',
'Bwd_Header_Length', 'Fwd_Packets/s', 'Bwd_Packets/s',
'Packet_Length_Max', 'Packet_Length_Mean', 'Packet_Length_Std',
'Packet_Length_Variance', 'Avg_Packet_Size', 'Avg_Fwd_Segment_Size',
'Avg_Bwd_Segment_Size', 'Subflow_Fwd_Packets', 'Subflow_Fwd_Bytes',
'Subflow_Bwd_Packets', 'Subflow_Bwd_Bytes', 'Init_Fwd_Win_Bytes',
'Init_Bwd_Win_Bytes', 'Fwd_Act_Data_Packets', 'Fwd_Seg_Size_Min',
'ClassLabel', 'isMalicious', 'attack_id'],
dtype='object')
# Apply the same space-to-underscore renaming to the full dataset's column names.
cic_df.columns = ['_'.join(name.split(' ')) for name in cic_df.columns]
# Keep the sampled dataset's column names as a plain Python list.
columns_list = list(sampled_cic_df.columns)
sampled_cic_df.head()
| Flow_Duration | Total_Fwd_Packets | Total_Backward_Packets | Fwd_Packets_Length_Total | Bwd_Packets_Length_Total | Fwd_Packet_Length_Max | Fwd_Packet_Length_Mean | Fwd_Packet_Length_Std | Bwd_Packet_Length_Max | Bwd_Packet_Length_Mean | ... | Subflow_Fwd_Bytes | Subflow_Bwd_Packets | Subflow_Bwd_Bytes | Init_Fwd_Win_Bytes | Init_Bwd_Win_Bytes | Fwd_Act_Data_Packets | Fwd_Seg_Size_Min | ClassLabel | isMalicious | attack_id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5968290 | 3813760.0 | 5.0 | 3.0 | 935.0 | 397.0 | 935.0 | 187.000 | 418.144714 | 397.0 | 132.333328 | ... | 935.0 | 3.0 | 397.0 | 219.0 | 211.0 | 1.0 | 32.0 | Benign | 0 | 0 |
| 8285216 | 396839.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000 | 0.000000 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 63326.0 | 235.0 | 0.0 | 20.0 | Benign | 0 | 0 |
| 8349977 | 1914354.0 | 8.0 | 7.0 | 1144.0 | 1581.0 | 677.0 | 143.000 | 227.969925 | 1173.0 | 225.857147 | ... | 1144.0 | 7.0 | 1581.0 | 8192.0 | 62856.0 | 5.0 | 20.0 | Benign | 0 | 0 |
| 7180832 | 4002.0 | 6.0 | 0.0 | 2064.0 | 0.0 | 440.0 | 44.000 | 148.722565 | 0.0 | 0.000000 | ... | 2064.0 | 0.0 | 0.0 | 8192.0 | 235.0 | 5.0 | 8.0 | DDoS | 1 | 3 |
| 2324438 | 5368715.0 | 8.0 | 6.0 | 355.0 | 232.0 | 198.0 | 44.375 | 75.864426 | 1460.0 | 108.000000 | ... | 355.0 | 6.0 | 232.0 | 8192.0 | 123.0 | 3.0 | 20.0 | Benign | 0 | 0 |
5 rows × 49 columns
# Compare the quartile boundaries of every feature column and collect the features where
# two neighbouring boundaries coincide: min == Q1, Q1 == Q3, or Q3 == max.
columns_equal_min_and_Q1 = []
columns_equal_Q1_and_Q3 = []
columns_equal_Q3_and_max = []
excluded_columns = ("isMalicious", "ClassLabel", "attack_id")
for col in columns_list:
    # The label/target columns are not features, so they are skipped.
    if col in excluded_columns:
        continue
    column_values = sampled_cic_df[col]
    min_value = column_values.min()
    p25_value = column_values.quantile(0.25)
    p75_value = column_values.quantile(0.75)
    max_value = column_values.max()
    # The three checks are independent: one feature may land in several lists.
    if min_value == p25_value:
        columns_equal_min_and_Q1.append(col)
    if p25_value == p75_value:
        columns_equal_Q1_and_Q3.append(col)
    if p75_value == max_value:
        columns_equal_Q3_and_max.append(col)
print("List of features having equal min and Q1 value:\n", columns_equal_min_and_Q1)
List of features having equal min and Q1 value: ['Bwd_Packets_Length_Total', 'Fwd_Packet_Length_Std', 'Bwd_Packet_Length_Max', 'Bwd_Packet_Length_Mean', 'Bwd_Packet_Length_Std', 'Flow_IAT_Std', 'Fwd_IAT_Std', 'Bwd_IAT_Total', 'Bwd_IAT_Mean', 'Bwd_IAT_Std', 'Bwd_IAT_Max', 'Bwd_IAT_Min', 'Avg_Bwd_Segment_Size', 'Subflow_Bwd_Bytes', 'Fwd_Act_Data_Packets']
print("List of features having equal Q1 and Q3 value:\n",columns_equal_Q1_and_Q3)
List of features having equal Q1 and Q3 value: ['Init_Fwd_Win_Bytes', 'Fwd_Seg_Size_Min']
print("List of features having equal Q3 and max value:\n",columns_equal_Q3_and_max)
List of features having equal Q3 and max value: []
From the above results we observed: -
- Following features in the sampled dataset have equal minimum and Q1 value: -
- Bwd_Packets_Length_Total
- Fwd_Packet_Length_Std
- Bwd_Packet_Length_Max
- Bwd_Packet_Length_Mean
- Bwd_Packet_Length_Std
- Flow_IAT_Std
- Fwd_IAT_Std
- Bwd_IAT_Total
- Bwd_IAT_Mean
- Bwd_IAT_Std
- Bwd_IAT_Max
- Bwd_IAT_Min
- Avg_Bwd_Segment_Size
- Subflow_Bwd_Bytes
- Fwd_Act_Data_Packets
- For the above features, a large number of records are clustered in lower range. Thus, these features may have many zero values, or many constant values in lower range of data-points.
- Since the values of these features are concentrated in the lower range of the data, the features are likely to be positively skewed.
- If the features have significant number of records with same value, we will need to analyze if they can help to differentiate between Malicious or Benign records.
- Following features in the sampled dataset have equal Q1 and Q3 value: -
- Init_Fwd_Win_Bytes
- Fwd_Seg_Size_Min
- For the above features, a large number of records are clustered at a single value. Thus, these features may have very low variability and may have many constant values.
- If the features have significant number of records with same value, we will need to analyze if they can help to differentiate between Malicious or Benign records.
- There are no features in the sampled dataset having equal Q3 and maximum value.
- Thus, there are no features having same value for 75th percentile and maximum value.
- As a result, all features are spread out in the upper range (not concentrated around a single value or a group of values).
- Thus, this check does not point to any features which are negatively skewed.
# Print the first quartile (Q1) of every feature whose minimum equals its Q1.
for col in columns_equal_min_and_Q1:
    q1_value = sampled_cic_df[col].quantile(0.25)
    print("Feature name: ", col, " , Q1 value: ", q1_value)
Feature name: Bwd_Packets_Length_Total , Q1 value: 0.0 Feature name: Fwd_Packet_Length_Std , Q1 value: 0.0 Feature name: Bwd_Packet_Length_Max , Q1 value: 0.0 Feature name: Bwd_Packet_Length_Mean , Q1 value: 0.0 Feature name: Bwd_Packet_Length_Std , Q1 value: 0.0 Feature name: Flow_IAT_Std , Q1 value: 0.0 Feature name: Fwd_IAT_Std , Q1 value: 0.0 Feature name: Bwd_IAT_Total , Q1 value: 0.0 Feature name: Bwd_IAT_Mean , Q1 value: 0.0 Feature name: Bwd_IAT_Std , Q1 value: 0.0 Feature name: Bwd_IAT_Max , Q1 value: 0.0 Feature name: Bwd_IAT_Min , Q1 value: 0.0 Feature name: Avg_Bwd_Segment_Size , Q1 value: 0.0 Feature name: Subflow_Bwd_Bytes , Q1 value: 0.0 Feature name: Fwd_Act_Data_Packets , Q1 value: 0.0
From the above results, we observed that the Q1 value is 0 for every feature having equal min and Q1 value, i.e. at least 25% of the values in each of these features are equal to 0.
# Print the first (Q1) and third (Q3) quartile of every feature whose Q1 equals its Q3.
for col in columns_equal_Q1_and_Q3:
    q1_value = sampled_cic_df[col].quantile(0.25)
    q3_value = sampled_cic_df[col].quantile(0.75)
    print("Feature name: ", col, " , Q1 value: ", q1_value, " , Q3 value: ", q3_value)
Feature name: Init_Fwd_Win_Bytes , Q1 value: 8192.0 , Q3 value: 8192.0 Feature name: Fwd_Seg_Size_Min , Q1 value: 20.0 , Q3 value: 20.0
sampled_cic_df['Init_Fwd_Win_Bytes'].median()
8192.0
sampled_cic_df['Fwd_Seg_Size_Min'].median()
20.0
From the above results we observed that at least 50% of the values (everything between Q1 and Q3) in the below two features are equal to a single constant value: -
- Init_Fwd_Win_Bytes : 8192.0
- Fwd_Seg_Size_Min : 20.0
# For every feature whose minimum equals its Q1, draw a stacked bar chart of record counts
# split by a zero / non-zero indicator of the feature and by isMalicious.
zero_class = "_zero"
for col in columns_equal_min_and_Q1:
    # Indicator column "<feature>_zero": 1 where the feature is exactly 0, otherwise 0.
    col_name = f"{col}{zero_class}"
    is_zero = sampled_cic_df[col] == 0
    sampled_cic_df[col_name] = is_zero.astype(int)
    # Rows: indicator value; columns: isMalicious value; cells: number of records.
    grouped_data = sampled_cic_df.groupby([col_name, 'isMalicious']).size().unstack()
    ax = grouped_data.plot(kind='bar', stacked=True)
    x_label_value = f"{col}(0: Non-zero, 1: Zero)"
    title_value = f"Comparison of isMalicious for Zero and Non-Zero {col}"
    plt.xlabel(x_label_value)
    plt.ylabel('Count')
    plt.title(title_value, pad=20)
    # Relabel the legend entries in order: first handle -> 'No', second handle -> 'Yes'.
    handles, _ = ax.get_legend_handles_labels()
    labels = ['No', 'Yes']
    ax.legend(handles, labels, title='isMalicious')
    plt.show()
# For every feature whose Q1 equals its Q3, draw a stacked bar chart of record counts
# split by "value equals the median" vs. "any other value" and by isMalicious.
mid_range_class = "_mid_range"
for col in columns_equal_Q1_and_Q3:
    # Indicator column "<feature>_mid_range": 1 where the feature equals its median, otherwise 0.
    col_name = f"{col}{mid_range_class}"
    mid_value = sampled_cic_df[col].median()
    at_mid_value = sampled_cic_df[col] == mid_value
    sampled_cic_df[col_name] = at_mid_value.astype(int)
    # Rows: indicator value; columns: isMalicious value; cells: number of records.
    grouped_data = sampled_cic_df.groupby([col_name, 'isMalicious']).size().unstack()
    ax = grouped_data.plot(kind='bar', stacked=True)
    x_label_value = f"{col}(0: Not mid-range, 1: Mid-range)"
    title_value = f"Comparison of isMalicious for Mid-range and Non-mid-range {col}"
    plt.xlabel(x_label_value)
    plt.ylabel('Count')
    plt.title(title_value, pad=20)
    # Relabel the legend entries in order: first handle -> 'No', second handle -> 'Yes'.
    handles, _ = ax.get_legend_handles_labels()
    labels = ['No', 'Yes']
    ax.legend(handles, labels, title='isMalicious')
    plt.show()
Thus, we observed that creating new categories for two sets of features does not help us get any useful information to differentiate Malicious events from Benign events.
sampled_cic_df.ClassLabel.value_counts()
ClassLabel Benign 1437467 DDoS 246982 DoS 79186 Botnet 29348 Bruteforce 20546 Infiltration 18870 Webattack 625 Portscan 430 Name: count, dtype: int64
# Mapping of ClassLabel value -> number of rows carrying that label.
# It is computed from the sampled dataset instead of being typed in by hand, so it cannot
# drift out of sync with the data when the sample changes. value_counts() orders the
# labels by descending count.
labels_and_counts = sampled_cic_df['ClassLabel'].value_counts().to_dict()
# For each class label, report whether the sampled dataset holds fully duplicated rows
# among the rows carrying that label.
for label, count in labels_and_counts.items():
    label_mask = sampled_cic_df['ClassLabel'] == label
    rows_with_labels = sampled_cic_df[label_mask]
    # duplicated() marks a row that repeats an earlier row on every column.
    has_duplicates = rows_with_labels.duplicated().any()
    if has_duplicates:
        print(f"There are duplicates in {count} rows with label '{label}'.")
    else:
        print(f"All {count} rows with label '{label}' are unique.")
There are duplicates in 1437467 rows with label 'Benign'. There are duplicates in 246982 rows with label 'DDoS'. There are duplicates in 79186 rows with label 'DoS'. All 29348 rows with label 'Botnet' are unique. All 20546 rows with label 'Bruteforce' are unique. There are duplicates in 18870 rows with label 'Infiltration'. There are duplicates in 625 rows with label 'Webattack'. All 430 rows with label 'Portscan' are unique.
sampled_cic_df.head()
| Flow_Duration | Total_Fwd_Packets | Total_Backward_Packets | Fwd_Packets_Length_Total | Bwd_Packets_Length_Total | Fwd_Packet_Length_Max | Fwd_Packet_Length_Mean | Fwd_Packet_Length_Std | Bwd_Packet_Length_Max | Bwd_Packet_Length_Mean | ... | Bwd_IAT_Total_zero | Bwd_IAT_Mean_zero | Bwd_IAT_Std_zero | Bwd_IAT_Max_zero | Bwd_IAT_Min_zero | Avg_Bwd_Segment_Size_zero | Subflow_Bwd_Bytes_zero | Fwd_Act_Data_Packets_zero | Init_Fwd_Win_Bytes_mid_range | Fwd_Seg_Size_Min_mid_range | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5968290 | 3813760.0 | 5.0 | 3.0 | 935.0 | 397.0 | 935.0 | 187.000 | 418.144714 | 397.0 | 132.333328 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 8285216 | 396839.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000 | 0.000000 | 0.0 | 0.000000 | ... | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 |
| 8349977 | 1914354.0 | 8.0 | 7.0 | 1144.0 | 1581.0 | 677.0 | 143.000 | 227.969925 | 1173.0 | 225.857147 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
| 7180832 | 4002.0 | 6.0 | 0.0 | 2064.0 | 0.0 | 440.0 | 44.000 | 148.722565 | 0.0 | 0.000000 | ... | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 |
| 2324438 | 5368715.0 | 8.0 | 6.0 | 355.0 | 232.0 | 198.0 | 44.375 | 75.864426 | 1460.0 | 108.000000 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 |
5 rows × 66 columns
# Remove the temporary "<feature>_zero" indicator columns from the sampled dataset.
# Columns are matched on the END of their name: DataFrame.filter(like=...) keeps every
# label that merely contains the text, so it could also drop an unrelated column that
# happens to have "_zero" in the middle of its name.
suffix = "_zero"
helper_columns = [name for name in sampled_cic_df.columns if name.endswith(suffix)]
sampled_cic_df = sampled_cic_df.drop(columns=helper_columns)
sampled_cic_df.head()
| Flow_Duration | Total_Fwd_Packets | Total_Backward_Packets | Fwd_Packets_Length_Total | Bwd_Packets_Length_Total | Fwd_Packet_Length_Max | Fwd_Packet_Length_Mean | Fwd_Packet_Length_Std | Bwd_Packet_Length_Max | Bwd_Packet_Length_Mean | ... | Subflow_Bwd_Bytes | Init_Fwd_Win_Bytes | Init_Bwd_Win_Bytes | Fwd_Act_Data_Packets | Fwd_Seg_Size_Min | ClassLabel | isMalicious | attack_id | Init_Fwd_Win_Bytes_mid_range | Fwd_Seg_Size_Min_mid_range | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5968290 | 3813760.0 | 5.0 | 3.0 | 935.0 | 397.0 | 935.0 | 187.000 | 418.144714 | 397.0 | 132.333328 | ... | 397.0 | 219.0 | 211.0 | 1.0 | 32.0 | Benign | 0 | 0 | 0 | 0 |
| 8285216 | 396839.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000 | 0.000000 | 0.0 | 0.000000 | ... | 0.0 | 63326.0 | 235.0 | 0.0 | 20.0 | Benign | 0 | 0 | 0 | 1 |
| 8349977 | 1914354.0 | 8.0 | 7.0 | 1144.0 | 1581.0 | 677.0 | 143.000 | 227.969925 | 1173.0 | 225.857147 | ... | 1581.0 | 8192.0 | 62856.0 | 5.0 | 20.0 | Benign | 0 | 0 | 1 | 1 |
| 7180832 | 4002.0 | 6.0 | 0.0 | 2064.0 | 0.0 | 440.0 | 44.000 | 148.722565 | 0.0 | 0.000000 | ... | 0.0 | 8192.0 | 235.0 | 5.0 | 8.0 | DDoS | 1 | 3 | 1 | 0 |
| 2324438 | 5368715.0 | 8.0 | 6.0 | 355.0 | 232.0 | 198.0 | 44.375 | 75.864426 | 1460.0 | 108.000000 | ... | 232.0 | 8192.0 | 123.0 | 3.0 | 20.0 | Benign | 0 | 0 | 1 | 1 |
5 rows × 51 columns
# Remove the temporary "<feature>_mid_range" indicator columns from the sampled dataset.
# Columns are matched on the END of their name: DataFrame.filter(like=...) keeps every
# label that merely contains the text, so it could also drop an unrelated column that
# happens to have "_mid_range" in the middle of its name.
suffix = "_mid_range"
helper_columns = [name for name in sampled_cic_df.columns if name.endswith(suffix)]
sampled_cic_df = sampled_cic_df.drop(columns=helper_columns)
sampled_cic_df.head()
| Flow_Duration | Total_Fwd_Packets | Total_Backward_Packets | Fwd_Packets_Length_Total | Bwd_Packets_Length_Total | Fwd_Packet_Length_Max | Fwd_Packet_Length_Mean | Fwd_Packet_Length_Std | Bwd_Packet_Length_Max | Bwd_Packet_Length_Mean | ... | Subflow_Fwd_Bytes | Subflow_Bwd_Packets | Subflow_Bwd_Bytes | Init_Fwd_Win_Bytes | Init_Bwd_Win_Bytes | Fwd_Act_Data_Packets | Fwd_Seg_Size_Min | ClassLabel | isMalicious | attack_id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5968290 | 3813760.0 | 5.0 | 3.0 | 935.0 | 397.0 | 935.0 | 187.000 | 418.144714 | 397.0 | 132.333328 | ... | 935.0 | 3.0 | 397.0 | 219.0 | 211.0 | 1.0 | 32.0 | Benign | 0 | 0 |
| 8285216 | 396839.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000 | 0.000000 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 63326.0 | 235.0 | 0.0 | 20.0 | Benign | 0 | 0 |
| 8349977 | 1914354.0 | 8.0 | 7.0 | 1144.0 | 1581.0 | 677.0 | 143.000 | 227.969925 | 1173.0 | 225.857147 | ... | 1144.0 | 7.0 | 1581.0 | 8192.0 | 62856.0 | 5.0 | 20.0 | Benign | 0 | 0 |
| 7180832 | 4002.0 | 6.0 | 0.0 | 2064.0 | 0.0 | 440.0 | 44.000 | 148.722565 | 0.0 | 0.000000 | ... | 2064.0 | 0.0 | 0.0 | 8192.0 | 235.0 | 5.0 | 8.0 | DDoS | 1 | 3 |
| 2324438 | 5368715.0 | 8.0 | 6.0 | 355.0 | 232.0 | 198.0 | 44.375 | 75.864426 | 1460.0 | 108.000000 | ... | 355.0 | 6.0 | 232.0 | 8192.0 | 123.0 | 3.0 | 20.0 | Benign | 0 | 0 |
5 rows × 49 columns
sampled_cic_df.columns
Index(['Flow_Duration', 'Total_Fwd_Packets', 'Total_Backward_Packets',
'Fwd_Packets_Length_Total', 'Bwd_Packets_Length_Total',
'Fwd_Packet_Length_Max', 'Fwd_Packet_Length_Mean',
'Fwd_Packet_Length_Std', 'Bwd_Packet_Length_Max',
'Bwd_Packet_Length_Mean', 'Bwd_Packet_Length_Std', 'Flow_Bytes/s',
'Flow_Packets/s', 'Flow_IAT_Mean', 'Flow_IAT_Std', 'Flow_IAT_Max',
'Flow_IAT_Min', 'Fwd_IAT_Total', 'Fwd_IAT_Mean', 'Fwd_IAT_Std',
'Fwd_IAT_Max', 'Fwd_IAT_Min', 'Bwd_IAT_Total', 'Bwd_IAT_Mean',
'Bwd_IAT_Std', 'Bwd_IAT_Max', 'Bwd_IAT_Min', 'Fwd_Header_Length',
'Bwd_Header_Length', 'Fwd_Packets/s', 'Bwd_Packets/s',
'Packet_Length_Max', 'Packet_Length_Mean', 'Packet_Length_Std',
'Packet_Length_Variance', 'Avg_Packet_Size', 'Avg_Fwd_Segment_Size',
'Avg_Bwd_Segment_Size', 'Subflow_Fwd_Packets', 'Subflow_Fwd_Bytes',
'Subflow_Bwd_Packets', 'Subflow_Bwd_Bytes', 'Init_Fwd_Win_Bytes',
'Init_Bwd_Win_Bytes', 'Fwd_Act_Data_Packets', 'Fwd_Seg_Size_Min',
'ClassLabel', 'isMalicious', 'attack_id'],
dtype='object')
sampled_cic_df.shape
(1833454, 49)
sampled_cic_df.ClassLabel.value_counts()
ClassLabel Benign 1437467 DDoS 246982 DoS 79186 Botnet 29348 Bruteforce 20546 Infiltration 18870 Webattack 625 Portscan 430 Name: count, dtype: int64
# Remove duplicated rows, keeping the first occurrence. Rows are compared on every column
# except the last one.
# DataFrame.drop_duplicates returns a NEW frame and leaves the original untouched unless
# inplace=True is passed, so the result must be assigned back; without the assignment the
# call has no effect and the shape stays the same.
# NOTE(review): the row counts recorded in the outputs below were produced without this
# de-duplication taking effect and will change once the notebook is re-run.
sampled_cic_df = sampled_cic_df.drop_duplicates(subset=sampled_cic_df.columns[:-1], keep='first')
sampled_cic_df.shape
(1833454, 49)
# Class labels retained in the sampled dataset; rows with any other label are discarded.
labels_to_keep = ['Benign', 'DDoS', 'Botnet', 'Bruteforce']
# Boolean mask: True for the rows whose ClassLabel is one of the retained labels.
keep_mask = sampled_cic_df['ClassLabel'].isin(labels_to_keep)
sampled_cic_df = sampled_cic_df[keep_mask]
sampled_cic_df.shape
(1734343, 49)
sampled_cic_df.head()
| Flow_Duration | Total_Fwd_Packets | Total_Backward_Packets | Fwd_Packets_Length_Total | Bwd_Packets_Length_Total | Fwd_Packet_Length_Max | Fwd_Packet_Length_Mean | Fwd_Packet_Length_Std | Bwd_Packet_Length_Max | Bwd_Packet_Length_Mean | ... | Subflow_Fwd_Bytes | Subflow_Bwd_Packets | Subflow_Bwd_Bytes | Init_Fwd_Win_Bytes | Init_Bwd_Win_Bytes | Fwd_Act_Data_Packets | Fwd_Seg_Size_Min | ClassLabel | isMalicious | attack_id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5968290 | 3813760.0 | 5.0 | 3.0 | 935.0 | 397.0 | 935.0 | 187.000 | 418.144714 | 397.0 | 132.333328 | ... | 935.0 | 3.0 | 397.0 | 219.0 | 211.0 | 1.0 | 32.0 | Benign | 0 | 0 |
| 8285216 | 396839.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000 | 0.000000 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 63326.0 | 235.0 | 0.0 | 20.0 | Benign | 0 | 0 |
| 8349977 | 1914354.0 | 8.0 | 7.0 | 1144.0 | 1581.0 | 677.0 | 143.000 | 227.969925 | 1173.0 | 225.857147 | ... | 1144.0 | 7.0 | 1581.0 | 8192.0 | 62856.0 | 5.0 | 20.0 | Benign | 0 | 0 |
| 7180832 | 4002.0 | 6.0 | 0.0 | 2064.0 | 0.0 | 440.0 | 44.000 | 148.722565 | 0.0 | 0.000000 | ... | 2064.0 | 0.0 | 0.0 | 8192.0 | 235.0 | 5.0 | 8.0 | DDoS | 1 | 3 |
| 2324438 | 5368715.0 | 8.0 | 6.0 | 355.0 | 232.0 | 198.0 | 44.375 | 75.864426 | 1460.0 | 108.000000 | ... | 355.0 | 6.0 | 232.0 | 8192.0 | 123.0 | 3.0 | 20.0 | Benign | 0 | 0 |
5 rows × 49 columns
sampled_cic_df.ClassLabel.value_counts()
ClassLabel Benign 1437467 DDoS 246982 Botnet 29348 Bruteforce 20546 Name: count, dtype: int64
- We assume that the sampled dataset's distribution of class labels is the same as the main dataset's distribution of class labels.
- We filter the data and keep only the rows with 4 class labels: 'Benign', 'DDoS', 'Botnet' and 'Bruteforce' (i.e. Benign records plus 3 types of attack).
- However, due to the large file size of the main dataset, we get an out-of-memory error when we try to drop the rows from it.
- Thus, we shall perform our modelling on sampled dataset.
sampled_cic_df.columns
Index(['Flow_Duration', 'Total_Fwd_Packets', 'Total_Backward_Packets',
'Fwd_Packets_Length_Total', 'Bwd_Packets_Length_Total',
'Fwd_Packet_Length_Max', 'Fwd_Packet_Length_Mean',
'Fwd_Packet_Length_Std', 'Bwd_Packet_Length_Max',
'Bwd_Packet_Length_Mean', 'Bwd_Packet_Length_Std', 'Flow_Bytes/s',
'Flow_Packets/s', 'Flow_IAT_Mean', 'Flow_IAT_Std', 'Flow_IAT_Max',
'Flow_IAT_Min', 'Fwd_IAT_Total', 'Fwd_IAT_Mean', 'Fwd_IAT_Std',
'Fwd_IAT_Max', 'Fwd_IAT_Min', 'Bwd_IAT_Total', 'Bwd_IAT_Mean',
'Bwd_IAT_Std', 'Bwd_IAT_Max', 'Bwd_IAT_Min', 'Fwd_Header_Length',
'Bwd_Header_Length', 'Fwd_Packets/s', 'Bwd_Packets/s',
'Packet_Length_Max', 'Packet_Length_Mean', 'Packet_Length_Std',
'Packet_Length_Variance', 'Avg_Packet_Size', 'Avg_Fwd_Segment_Size',
'Avg_Bwd_Segment_Size', 'Subflow_Fwd_Packets', 'Subflow_Fwd_Bytes',
'Subflow_Bwd_Packets', 'Subflow_Bwd_Bytes', 'Init_Fwd_Win_Bytes',
'Init_Bwd_Win_Bytes', 'Fwd_Act_Data_Packets', 'Fwd_Seg_Size_Min',
'ClassLabel', 'isMalicious', 'attack_id'],
dtype='object')
- Since we had performed label encoding on ClassLabel and stored the results earlier, the encoded values for the type of attack will have gaps after we dropped the records.
- Thus, we will drop the column attack_id and redo label encoding on the sampled dataset.
# Discard the existing attack_id column from the sampled dataset.
sampled_cic_df = sampled_cic_df.drop(columns=['attack_id'])
sampled_cic_df.columns
Index(['Flow_Duration', 'Total_Fwd_Packets', 'Total_Backward_Packets',
'Fwd_Packets_Length_Total', 'Bwd_Packets_Length_Total',
'Fwd_Packet_Length_Max', 'Fwd_Packet_Length_Mean',
'Fwd_Packet_Length_Std', 'Bwd_Packet_Length_Max',
'Bwd_Packet_Length_Mean', 'Bwd_Packet_Length_Std', 'Flow_Bytes/s',
'Flow_Packets/s', 'Flow_IAT_Mean', 'Flow_IAT_Std', 'Flow_IAT_Max',
'Flow_IAT_Min', 'Fwd_IAT_Total', 'Fwd_IAT_Mean', 'Fwd_IAT_Std',
'Fwd_IAT_Max', 'Fwd_IAT_Min', 'Bwd_IAT_Total', 'Bwd_IAT_Mean',
'Bwd_IAT_Std', 'Bwd_IAT_Max', 'Bwd_IAT_Min', 'Fwd_Header_Length',
'Bwd_Header_Length', 'Fwd_Packets/s', 'Bwd_Packets/s',
'Packet_Length_Max', 'Packet_Length_Mean', 'Packet_Length_Std',
'Packet_Length_Variance', 'Avg_Packet_Size', 'Avg_Fwd_Segment_Size',
'Avg_Bwd_Segment_Size', 'Subflow_Fwd_Packets', 'Subflow_Fwd_Bytes',
'Subflow_Bwd_Packets', 'Subflow_Bwd_Bytes', 'Init_Fwd_Win_Bytes',
'Init_Bwd_Win_Bytes', 'Fwd_Act_Data_Packets', 'Fwd_Seg_Size_Min',
'ClassLabel', 'isMalicious'],
dtype='object')
# Fit the encoder `le` (created earlier in the notebook) on the ClassLabel values and
# store the resulting codes in a new attack_id column.
sampled_cic_df["attack_id"] = le.fit_transform(sampled_cic_df["ClassLabel"])
# Build a label -> code lookup by encoding each fitted class once.
encoded_ids = le.transform(le.classes_)
label_mapping = {label: code for label, code in zip(le.classes_, encoded_ids)}
print("Attack id of each distinct value in field ClassLabel:", label_mapping)
Attack id of each distinct value in field ClassLabel: {'Benign': 0, 'Botnet': 1, 'Bruteforce': 2, 'DDoS': 3}
sampled_cic_df.shape
(1734343, 49)
sampled_cic_df.head()
| Flow_Duration | Total_Fwd_Packets | Total_Backward_Packets | Fwd_Packets_Length_Total | Bwd_Packets_Length_Total | Fwd_Packet_Length_Max | Fwd_Packet_Length_Mean | Fwd_Packet_Length_Std | Bwd_Packet_Length_Max | Bwd_Packet_Length_Mean | ... | Subflow_Fwd_Bytes | Subflow_Bwd_Packets | Subflow_Bwd_Bytes | Init_Fwd_Win_Bytes | Init_Bwd_Win_Bytes | Fwd_Act_Data_Packets | Fwd_Seg_Size_Min | ClassLabel | isMalicious | attack_id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5968290 | 3813760.0 | 5.0 | 3.0 | 935.0 | 397.0 | 935.0 | 187.000 | 418.144714 | 397.0 | 132.333328 | ... | 935.0 | 3.0 | 397.0 | 219.0 | 211.0 | 1.0 | 32.0 | Benign | 0 | 0 |
| 8285216 | 396839.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000 | 0.000000 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 63326.0 | 235.0 | 0.0 | 20.0 | Benign | 0 | 0 |
| 8349977 | 1914354.0 | 8.0 | 7.0 | 1144.0 | 1581.0 | 677.0 | 143.000 | 227.969925 | 1173.0 | 225.857147 | ... | 1144.0 | 7.0 | 1581.0 | 8192.0 | 62856.0 | 5.0 | 20.0 | Benign | 0 | 0 |
| 7180832 | 4002.0 | 6.0 | 0.0 | 2064.0 | 0.0 | 440.0 | 44.000 | 148.722565 | 0.0 | 0.000000 | ... | 2064.0 | 0.0 | 0.0 | 8192.0 | 235.0 | 5.0 | 8.0 | DDoS | 1 | 3 |
| 2324438 | 5368715.0 | 8.0 | 6.0 | 355.0 | 232.0 | 198.0 | 44.375 | 75.864426 | 1460.0 | 108.000000 | ... | 355.0 | 6.0 | 232.0 | 8192.0 | 123.0 | 3.0 | 20.0 | Benign | 0 | 0 |
5 rows × 49 columns
sampled_cic_df.tail()
| Flow_Duration | Total_Fwd_Packets | Total_Backward_Packets | Fwd_Packets_Length_Total | Bwd_Packets_Length_Total | Fwd_Packet_Length_Max | Fwd_Packet_Length_Mean | Fwd_Packet_Length_Std | Bwd_Packet_Length_Max | Bwd_Packet_Length_Mean | ... | Subflow_Fwd_Bytes | Subflow_Bwd_Packets | Subflow_Bwd_Bytes | Init_Fwd_Win_Bytes | Init_Bwd_Win_Bytes | Fwd_Act_Data_Packets | Fwd_Seg_Size_Min | ClassLabel | isMalicious | attack_id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1606912 | 189583.0 | 10.0 | 8.0 | 496.0 | 232.0 | 192.0 | 49.599998 | 77.654793 | 1460.0 | 108.0 | ... | 496.0 | 8.0 | 232.0 | 8192.0 | 31.0 | 4.0 | 20.0 | Benign | 0 | 0 |
| 7433839 | 3000787.0 | 4.0 | 0.0 | 2064.0 | 0.0 | 516.0 | 44.000000 | 0.000000 | 0.0 | 0.0 | ... | 2064.0 | 0.0 | 0.0 | 8192.0 | 235.0 | 3.0 | 20.0 | DDoS | 1 | 3 |
| 2510144 | 40.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 8192.0 | 16625.0 | 0.0 | 8.0 | Benign | 0 | 0 |
| 760618 | 396839.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 279.0 | 235.0 | 0.0 | 20.0 | Benign | 0 | 0 |
| 7134908 | 4097.0 | 3.0 | 0.0 | 97.0 | 0.0 | 440.0 | 44.000000 | 27.430199 | 0.0 | 0.0 | ... | 97.0 | 0.0 | 0.0 | 8192.0 | 235.0 | 1.0 | 20.0 | DDoS | 1 | 3 |
5 rows × 49 columns
# Drop two columns in a single pass:
#   - ClassLabel: after removing it, two target features remain
#     (isMalicious and attack_id).
#   - Init_Bwd_Win_Bytes: removed based on observations from the Pyramid chart.
sampled_cic_df = sampled_cic_df.drop(columns=['ClassLabel', 'Init_Bwd_Win_Bytes'])
# Check the resulting (rows, columns) dimensions.
sampled_cic_df.shape
(1734343, 47)
# Preview the first five rows of the dataset after the column drops.
sampled_cic_df.head(n=5)
| Flow_Duration | Total_Fwd_Packets | Total_Backward_Packets | Fwd_Packets_Length_Total | Bwd_Packets_Length_Total | Fwd_Packet_Length_Max | Fwd_Packet_Length_Mean | Fwd_Packet_Length_Std | Bwd_Packet_Length_Max | Bwd_Packet_Length_Mean | ... | Avg_Bwd_Segment_Size | Subflow_Fwd_Packets | Subflow_Fwd_Bytes | Subflow_Bwd_Packets | Subflow_Bwd_Bytes | Init_Fwd_Win_Bytes | Fwd_Act_Data_Packets | Fwd_Seg_Size_Min | isMalicious | attack_id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5968290 | 3813760.0 | 5.0 | 3.0 | 935.0 | 397.0 | 935.0 | 187.000 | 418.144714 | 397.0 | 132.333328 | ... | 132.333328 | 5.0 | 935.0 | 3.0 | 397.0 | 219.0 | 1.0 | 32.0 | 0 | 0 |
| 8285216 | 396839.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000 | 0.000000 | 0.0 | 0.000000 | ... | 0.000000 | 2.0 | 0.0 | 0.0 | 0.0 | 63326.0 | 0.0 | 20.0 | 0 | 0 |
| 8349977 | 1914354.0 | 8.0 | 7.0 | 1144.0 | 1581.0 | 677.0 | 143.000 | 227.969925 | 1173.0 | 225.857147 | ... | 225.857147 | 8.0 | 1144.0 | 7.0 | 1581.0 | 8192.0 | 5.0 | 20.0 | 0 | 0 |
| 7180832 | 4002.0 | 6.0 | 0.0 | 2064.0 | 0.0 | 440.0 | 44.000 | 148.722565 | 0.0 | 0.000000 | ... | 0.000000 | 6.0 | 2064.0 | 0.0 | 0.0 | 8192.0 | 5.0 | 8.0 | 1 | 3 |
| 2324438 | 5368715.0 | 8.0 | 6.0 | 355.0 | 232.0 | 198.0 | 44.375 | 75.864426 | 1460.0 | 108.000000 | ... | 108.000000 | 8.0 | 355.0 | 6.0 | 232.0 | 8192.0 | 3.0 | 20.0 | 0 | 0 |
5 rows × 47 columns
# Preview the last five rows of the dataset after the column drops.
sampled_cic_df.tail(n=5)
| Flow_Duration | Total_Fwd_Packets | Total_Backward_Packets | Fwd_Packets_Length_Total | Bwd_Packets_Length_Total | Fwd_Packet_Length_Max | Fwd_Packet_Length_Mean | Fwd_Packet_Length_Std | Bwd_Packet_Length_Max | Bwd_Packet_Length_Mean | ... | Avg_Bwd_Segment_Size | Subflow_Fwd_Packets | Subflow_Fwd_Bytes | Subflow_Bwd_Packets | Subflow_Bwd_Bytes | Init_Fwd_Win_Bytes | Fwd_Act_Data_Packets | Fwd_Seg_Size_Min | isMalicious | attack_id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1606912 | 189583.0 | 10.0 | 8.0 | 496.0 | 232.0 | 192.0 | 49.599998 | 77.654793 | 1460.0 | 108.0 | ... | 108.0 | 10.0 | 496.0 | 8.0 | 232.0 | 8192.0 | 4.0 | 20.0 | 0 | 0 |
| 7433839 | 3000787.0 | 4.0 | 0.0 | 2064.0 | 0.0 | 516.0 | 44.000000 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 4.0 | 2064.0 | 0.0 | 0.0 | 8192.0 | 3.0 | 20.0 | 1 | 3 |
| 2510144 | 40.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 8192.0 | 0.0 | 8.0 | 0 | 0 |
| 760618 | 396839.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 279.0 | 0.0 | 20.0 | 0 | 0 |
| 7134908 | 4097.0 | 3.0 | 0.0 | 97.0 | 0.0 | 440.0 | 44.000000 | 27.430199 | 0.0 | 0.0 | ... | 0.0 | 3.0 | 97.0 | 0.0 | 0.0 | 8192.0 | 1.0 | 20.0 | 1 | 3 |
5 rows × 47 columns
# Persist the processed sample to a new Parquet file so that it can be
# reloaded later for feature selection and model training.
sampled_cic_df.to_parquet(path='processed_dataset.parquet')